Project 4 Final

#Predicting survival of patients with heart failure. This is a classification problem.

# Read the heart-failure records from CSV (readr returns a tibble).
# NOTE(review): the path below is an absolute, machine-specific location, so
# the script only runs unchanged on a machine where this file exists.
library(readr)
heartFailure <- read_csv(
  file = "C:/Users/dnred/OneDrive/Desktop/CS 583/heartFailure.csv"
)

## Rows: 299 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): age, anaemia, creatinine_phosphokinase, diabetes, ejection_fractio...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the spreadsheet-style viewer only when a user is at the console:
# View() needs a GUI and can fail in batch or headless runs.
if (interactive()) {
  View(heartFailure)
}

#Load the packages
library(mlbench)

## Warning: package 'mlbench' was built under R version 4.3.3

library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: ggplot2
## Loading required package: lattice

library(dendextend)

## Warning: package 'dendextend' was built under R version 4.3.3

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:stats':
## 
##     cutree

library(cluster)
library(fpc)

## Warning: package 'fpc' was built under R version 4.3.3

library(clValid)

## Warning: package 'clValid' was built under R version 4.3.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.92 loaded

library(rpart)

## 
## Attaching package: 'rpart'
## 
## The following object is masked from 'package:dendextend':
## 
##     prune

library(rattle)

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.3.3

library(RColorBrewer)
library(class)

## Warning: package 'class' was built under R version 4.3.3

library(gmodels)

## Warning: package 'gmodels' was built under R version 4.3.3

library(kernlab)

## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(C50)

## Warning: package 'C50' was built under R version 4.3.3

# Statistical analysis ----
# Bind the imported data to the shorter name `hf` used by the rest of the
# script.
hf <- heartFailure
# Compact overview: one line per column with its type and leading values.
utils::str(hf)

## spc_tbl_ [299 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age                     : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : num [1:299] 0 0 0 1 1 1 1 1 0 1 ...
##  $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
##  $ diabetes                : num [1:299] 0 0 0 0 1 0 0 1 0 0 ...
##  $ ejection_fraction       : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : num [1:299] 1 0 0 0 0 1 0 0 0 1 ...
##  $ platelets               : num [1:299] 265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : num [1:299] 1 1 1 1 0 1 1 1 0 1 ...
##  $ smoking                 : num [1:299] 0 0 1 0 0 1 0 1 0 1 ...
##  $ time                    : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : num [1:299] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   anaemia = col_double(),
##   ..   creatinine_phosphokinase = col_double(),
##   ..   diabetes = col_double(),
##   ..   ejection_fraction = col_double(),
##   ..   high_blood_pressure = col_double(),
##   ..   platelets = col_double(),
##   ..   serum_creatinine = col_double(),
##   ..   serum_sodium = col_double(),
##   ..   sex = col_double(),
##   ..   smoking = col_double(),
##   ..   time = col_double(),
##   ..   DEATH_EVENT = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Open the data viewer only in an interactive session: View() needs a GUI and
# can fail in batch or headless runs.
if (interactive()) {
  View(hf)
}
# Summarize the dataset: quartiles, min/max and mean for every column.
summary(hf)

##       age           anaemia       creatinine_phosphokinase    diabetes     
##  Min.   :40.00   Min.   :0.0000   Min.   :  23.0           Min.   :0.0000  
##  1st Qu.:51.00   1st Qu.:0.0000   1st Qu.: 116.5           1st Qu.:0.0000  
##  Median :60.00   Median :0.0000   Median : 250.0           Median :0.0000  
##  Mean   :60.83   Mean   :0.4314   Mean   : 581.8           Mean   :0.4181  
##  3rd Qu.:70.00   3rd Qu.:1.0000   3rd Qu.: 582.0           3rd Qu.:1.0000  
##  Max.   :95.00   Max.   :1.0000   Max.   :7861.0           Max.   :1.0000  
##  ejection_fraction high_blood_pressure   platelets      serum_creatinine
##  Min.   :14.00     Min.   :0.0000      Min.   : 25100   Min.   :0.500   
##  1st Qu.:30.00     1st Qu.:0.0000      1st Qu.:212500   1st Qu.:0.900   
##  Median :38.00     Median :0.0000      Median :262000   Median :1.100   
##  Mean   :38.08     Mean   :0.3512      Mean   :263358   Mean   :1.394   
##  3rd Qu.:45.00     3rd Qu.:1.0000      3rd Qu.:303500   3rd Qu.:1.400   
##  Max.   :80.00     Max.   :1.0000      Max.   :850000   Max.   :9.400   
##   serum_sodium        sex            smoking            time      
##  Min.   :113.0   Min.   :0.0000   Min.   :0.0000   Min.   :  4.0  
##  1st Qu.:134.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 73.0  
##  Median :137.0   Median :1.0000   Median :0.0000   Median :115.0  
##  Mean   :136.6   Mean   :0.6488   Mean   :0.3211   Mean   :130.3  
##  3rd Qu.:140.0   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:203.0  
##  Max.   :148.0   Max.   :1.0000   Max.   :1.0000   Max.   :285.0  
##   DEATH_EVENT    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3211  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

# Show the first 20 rows of the data.
head(x = hf, n = 20L)

## # A tibble: 20 × 13
##      age anaemia creatinine_phosphokinase diabetes ejection_fraction
##    <dbl>   <dbl>                    <dbl>    <dbl>             <dbl>
##  1    75       0                      582        0                20
##  2    55       0                     7861        0                38
##  3    65       0                      146        0                20
##  4    50       1                      111        0                20
##  5    65       1                      160        1                20
##  6    90       1                       47        0                40
##  7    75       1                      246        0                15
##  8    60       1                      315        1                60
##  9    65       0                      157        0                65
## 10    80       1                      123        0                35
## 11    75       1                       81        0                38
## 12    62       0                      231        0                25
## 13    45       1                      981        0                30
## 14    50       1                      168        0                38
## 15    49       1                       80        0                30
## 16    82       1                      379        0                50
## 17    87       1                      149        0                38
## 18    45       0                      582        0                14
## 19    70       1                      125        0                25
## 20    48       1                      582        1                55
## # ℹ 8 more variables: high_blood_pressure <dbl>, platelets <dbl>,
## #   serum_creatinine <dbl>, serum_sodium <dbl>, sex <dbl>, smoking <dbl>,
## #   time <dbl>, DEATH_EVENT <dbl>

# Report the dataset dimensions as (rows, columns).
dim(x = hf)

## [1] 299  13

# List the class of every column (one entry per attribute).
sapply(X = hf, FUN = class)

##                      age                  anaemia creatinine_phosphokinase 
##                "numeric"                "numeric"                "numeric" 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                "numeric"                "numeric"                "numeric" 
##                platelets         serum_creatinine             serum_sodium 
##                "numeric"                "numeric"                "numeric" 
##                      sex                  smoking                     time 
##                "numeric"                "numeric"                "numeric" 
##              DEATH_EVENT 
##                "numeric"

# Distribution of the class variable: absolute counts next to the share of
# each class expressed as a percentage.
y <- hf[["DEATH_EVENT"]]
cbind(
  freq = table(y),
  percentage = proportions(table(y)) * 100
)

##   freq percentage
## 0  203   67.89298
## 1   96   32.10702

# Standard deviation of each of the 13 columns, returned as a named numeric
# vector (one value per column).
vapply(hf[, 1:13], sd, FUN.VALUE = numeric(1))

##                      age                  anaemia creatinine_phosphokinase 
##             1.189481e+01             4.961073e-01             9.702879e+02 
##                 diabetes        ejection_fraction      high_blood_pressure 
##             4.940671e-01             1.183484e+01             4.781364e-01 
##                platelets         serum_creatinine             serum_sodium 
##             9.780424e+04             1.034510e+00             4.412477e+00 
##                      sex                  smoking                     time 
##             4.781364e-01             4.676704e-01             7.761421e+01 
##              DEATH_EVENT 
##             4.676704e-01

# Skewness of every column. Iterate over the columns directly with vapply()
# instead of apply(): apply() first coerces the data frame to a matrix, which
# would silently change column types if a non-numeric column were ever added.
# vapply() also guarantees one numeric value per column.
skew <- vapply(hf[, 1:13], skewness, FUN.VALUE = numeric(1))
# Display skewness; values further from 0 (in either direction) indicate a
# more skewed distribution.
print(skew)

##                      age                  anaemia creatinine_phosphokinase 
##                0.4188266                0.2754750                4.4184296 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                0.3305857                0.5498228                0.6204576 
##                platelets         serum_creatinine             serum_sodium 
##                1.4476814                4.4113866               -1.0376430 
##                      sex                  smoking                     time 
##               -0.6204576                0.7626368                0.1265232 
##              DEATH_EVENT 
##                0.7626368

# Pairwise correlation matrix (Pearson, the cor() default) across all 13
# numeric columns.
correlations <- cor(x = hf[, 1:13], method = "pearson")
# Show the full matrix.
print(correlations)

##                                  age     anaemia creatinine_phosphokinase
## age                       1.00000000  0.08800644             -0.081583900
## anaemia                   0.08800644  1.00000000             -0.190741030
## creatinine_phosphokinase -0.08158390 -0.19074103              1.000000000
## diabetes                 -0.10101239 -0.01272905             -0.009638514
## ejection_fraction         0.06009836  0.03155697             -0.044079554
## high_blood_pressure       0.09328868  0.03818200             -0.070589980
## platelets                -0.05235437 -0.04378555              0.024463389
## serum_creatinine          0.15918713  0.05217360             -0.016408480
## serum_sodium             -0.04596584  0.04188161              0.059550156
## sex                       0.06542952 -0.09476896              0.079790629
## smoking                   0.01866787 -0.10728984              0.002421235
## time                     -0.22406842 -0.14141398             -0.009345653
## DEATH_EVENT               0.25372854  0.06627010              0.062728160
##                              diabetes ejection_fraction high_blood_pressure
## age                      -0.101012385        0.06009836         0.093288685
## anaemia                  -0.012729046        0.03155697         0.038182003
## creatinine_phosphokinase -0.009638514       -0.04407955        -0.070589980
## diabetes                  1.000000000       -0.00485031        -0.012732382
## ejection_fraction        -0.004850310        1.00000000         0.024444731
## high_blood_pressure      -0.012732382        0.02444473         1.000000000
## platelets                 0.092192828        0.07217747         0.049963481
## serum_creatinine         -0.046975315       -0.01130247        -0.004934525
## serum_sodium             -0.089550619        0.17590228         0.037109470
## sex                      -0.157729504       -0.14838597        -0.104614629
## smoking                  -0.147173413       -0.06731457        -0.055711369
## time                      0.033725509        0.04172924        -0.196439479
## DEATH_EVENT              -0.001942883       -0.26860331         0.079351058
##                            platelets serum_creatinine serum_sodium          sex
## age                      -0.05235437      0.159187133 -0.045965841  0.065429524
## anaemia                  -0.04378555      0.052173604  0.041881610 -0.094768961
## creatinine_phosphokinase  0.02446339     -0.016408480  0.059550156  0.079790629
## diabetes                  0.09219283     -0.046975315 -0.089550619 -0.157729504
## ejection_fraction         0.07217747     -0.011302475  0.175902282 -0.148385965
## high_blood_pressure       0.04996348     -0.004934525  0.037109470 -0.104614629
## platelets                 1.00000000     -0.041198077  0.062124619 -0.125120483
## serum_creatinine         -0.04119808      1.000000000 -0.189095210  0.006969778
## serum_sodium              0.06212462     -0.189095210  1.000000000 -0.027566123
## sex                      -0.12512048      0.006969778 -0.027566123  1.000000000
## smoking                   0.02823445     -0.027414135  0.004813195  0.445891712
## time                      0.01051391     -0.149315418  0.087640000 -0.015608220
## DEATH_EVENT              -0.04913887      0.294277561 -0.195203596 -0.004316376
##                               smoking         time  DEATH_EVENT
## age                       0.018667868 -0.224068420  0.253728543
## anaemia                  -0.107289838 -0.141413982  0.066270098
## creatinine_phosphokinase  0.002421235 -0.009345653  0.062728160
## diabetes                 -0.147173413  0.033725509 -0.001942883
## ejection_fraction        -0.067314567  0.041729235 -0.268603312
## high_blood_pressure      -0.055711369 -0.196439479  0.079351058
## platelets                 0.028234448  0.010513909 -0.049138868
## serum_creatinine         -0.027414135 -0.149315418  0.294277561
## serum_sodium              0.004813195  0.087640000 -0.195203596
## sex                       0.445891712 -0.015608220 -0.004316376
## smoking                   1.000000000 -0.022838942 -0.012623153
## time                     -0.022838942  1.000000000 -0.526963779
## DEATH_EVENT              -0.012623153 -0.526963779  1.000000000

# Visualize the correlation matrix, one circle per pair of variables.
corrplot(corr = correlations, method = "circle")

# Share of patients with DEATH_EVENT == 1; because the column is coded 0/1,
# its mean is that proportion.
death_proportion <- mean(hf[["DEATH_EVENT"]])
# Display the proportion.
print(death_proportion)

## [1] 0.3210702

# Missing values: first check whether any NA is present anywhere in the data.
anyNA(hf)

## [1] FALSE

# Drop every row that contains an NA, then re-inspect the structure.
hf <- stats::na.omit(hf)
utils::str(hf)

## tibble [299 × 13] (S3: tbl_df/tbl/data.frame)
##  $ age                     : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : num [1:299] 0 0 0 1 1 1 1 1 0 1 ...
##  $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
##  $ diabetes                : num [1:299] 0 0 0 0 1 0 0 1 0 0 ...
##  $ ejection_fraction       : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : num [1:299] 1 0 0 0 0 1 0 0 0 1 ...
##  $ platelets               : num [1:299] 265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : num [1:299] 1 1 1 1 0 1 1 1 0 1 ...
##  $ smoking                 : num [1:299] 0 0 1 0 0 1 0 1 0 1 ...
##  $ time                    : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : num [1:299] 1 1 1 1 1 1 1 1 1 1 ...

# Open the data viewer only in an interactive session: View() needs a GUI and
# can fail in batch or headless runs.
if (interactive()) {
  View(hf)
}

# Scale data ----
# The scale transform calculates the standard deviation of an attribute and
# divides each value by that standard deviation.
# Summarize the untransformed data first, as a baseline for comparison.
summary(hf[, 1:13])

##       age           anaemia       creatinine_phosphokinase    diabetes     
##  Min.   :40.00   Min.   :0.0000   Min.   :  23.0           Min.   :0.0000  
##  1st Qu.:51.00   1st Qu.:0.0000   1st Qu.: 116.5           1st Qu.:0.0000  
##  Median :60.00   Median :0.0000   Median : 250.0           Median :0.0000  
##  Mean   :60.83   Mean   :0.4314   Mean   : 581.8           Mean   :0.4181  
##  3rd Qu.:70.00   3rd Qu.:1.0000   3rd Qu.: 582.0           3rd Qu.:1.0000  
##  Max.   :95.00   Max.   :1.0000   Max.   :7861.0           Max.   :1.0000  
##  ejection_fraction high_blood_pressure   platelets      serum_creatinine
##  Min.   :14.00     Min.   :0.0000      Min.   : 25100   Min.   :0.500   
##  1st Qu.:30.00     1st Qu.:0.0000      1st Qu.:212500   1st Qu.:0.900   
##  Median :38.00     Median :0.0000      Median :262000   Median :1.100   
##  Mean   :38.08     Mean   :0.3512      Mean   :263358   Mean   :1.394   
##  3rd Qu.:45.00     3rd Qu.:1.0000      3rd Qu.:303500   3rd Qu.:1.400   
##  Max.   :80.00     Max.   :1.0000      Max.   :850000   Max.   :9.400   
##   serum_sodium        sex            smoking            time      
##  Min.   :113.0   Min.   :0.0000   Min.   :0.0000   Min.   :  4.0  
##  1st Qu.:134.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 73.0  
##  Median :137.0   Median :1.0000   Median :0.0000   Median :115.0  
##  Mean   :136.6   Mean   :0.6488   Mean   :0.3211   Mean   :130.3  
##  3rd Qu.:140.0   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:203.0  
##  Max.   :148.0   Max.   :1.0000   Max.   :1.0000   Max.   :285.0  
##   DEATH_EVENT    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3211  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

# Estimate the scaling parameters (per-column standard deviations).
# NOTE(review): columns 1:13 include DEATH_EVENT, so the class label is
# rescaled together with the predictors.
preprocessParams <- preProcess(hf[, 1:13], method = "scale")
# Show which transformations were set up.
print(preprocessParams)

## Created from 299 samples and 13 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (13)

# Apply the stored parameters to the same 13 columns.
transformed <- predict(preprocessParams, newdata = hf[, 1:13])
# Inspect the value ranges after the transformation.
summary(transformed)

##       age           anaemia       creatinine_phosphokinase    diabetes     
##  Min.   :3.363   Min.   :0.0000   Min.   :0.0237           Min.   :0.0000  
##  1st Qu.:4.288   1st Qu.:0.0000   1st Qu.:0.1201           1st Qu.:0.0000  
##  Median :5.044   Median :0.0000   Median :0.2577           Median :0.0000  
##  Mean   :5.114   Mean   :0.8696   Mean   :0.5997           Mean   :0.8462  
##  3rd Qu.:5.885   3rd Qu.:2.0157   3rd Qu.:0.5998           3rd Qu.:2.0240  
##  Max.   :7.987   Max.   :2.0157   Max.   :8.1017           Max.   :2.0240  
##  ejection_fraction high_blood_pressure   platelets      serum_creatinine
##  Min.   :1.183     Min.   :0.0000      Min.   :0.2566   Min.   :0.4833  
##  1st Qu.:2.535     1st Qu.:0.0000      1st Qu.:2.1727   1st Qu.:0.8700  
##  Median :3.211     Median :0.0000      Median :2.6788   Median :1.0633  
##  Mean   :3.218     Mean   :0.7345      Mean   :2.6927   Mean   :1.3474  
##  3rd Qu.:3.802     3rd Qu.:2.0915      3rd Qu.:3.1031   3rd Qu.:1.3533  
##  Max.   :6.760     Max.   :2.0915      Max.   :8.6908   Max.   :9.0864  
##   serum_sodium        sex           smoking            time        
##  Min.   :25.61   Min.   :0.000   Min.   :0.0000   Min.   :0.05154  
##  1st Qu.:30.37   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.94055  
##  Median :31.05   Median :2.091   Median :0.0000   Median :1.48169  
##  Mean   :30.96   Mean   :1.357   Mean   :0.6865   Mean   :1.67831  
##  3rd Qu.:31.73   3rd Qu.:2.091   3rd Qu.:2.1383   3rd Qu.:2.61550  
##  Max.   :33.54   Max.   :2.091   Max.   :2.1383   Max.   :3.67201  
##   DEATH_EVENT    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.6865  
##  3rd Qu.:2.1383  
##  Max.   :2.1383

# Center data ----
# The center transform calculates the mean of an attribute and subtracts it
# from each value.
# Estimate the centering parameters (per-column means).
# NOTE(review): columns 1:13 include DEATH_EVENT, so the class label is
# centered together with the predictors.
preprocessParams <- preProcess(hf[, 1:13], method = "center")
# Show which transformations were set up.
print(preprocessParams)

## Created from 299 samples and 13 variables
## 
## Pre-processing:
##   - centered (13)
##   - ignored (0)

# Apply the stored parameters to the same 13 columns.
transformed <- predict(preprocessParams, newdata = hf[, 1:13])
# Inspect the value ranges after the transformation.
summary(transformed)

##       age              anaemia        creatinine_phosphokinase
##  Min.   :-20.8339   Min.   :-0.4314   Min.   :-558.839        
##  1st Qu.: -9.8339   1st Qu.:-0.4314   1st Qu.:-465.339        
##  Median : -0.8339   Median :-0.4314   Median :-331.839        
##  Mean   :  0.0000   Mean   : 0.0000   Mean   :   0.000        
##  3rd Qu.:  9.1661   3rd Qu.: 0.5686   3rd Qu.:   0.161        
##  Max.   : 34.1661   Max.   : 0.5686   Max.   :7279.161        
##     diabetes       ejection_fraction   high_blood_pressure   platelets      
##  Min.   :-0.4181   Min.   :-24.08361   Min.   :-0.3512     Min.   :-238258  
##  1st Qu.:-0.4181   1st Qu.: -8.08361   1st Qu.:-0.3512     1st Qu.: -50858  
##  Median :-0.4181   Median : -0.08361   Median :-0.3512     Median :  -1358  
##  Mean   : 0.0000   Mean   :  0.00000   Mean   : 0.0000     Mean   :      0  
##  3rd Qu.: 0.5819   3rd Qu.:  6.91639   3rd Qu.: 0.6488     3rd Qu.:  40142  
##  Max.   : 0.5819   Max.   : 41.91639   Max.   : 0.6488     Max.   : 586642  
##  serum_creatinine    serum_sodium           sex             smoking       
##  Min.   :-0.89388   Min.   :-23.6254   Min.   :-0.6488   Min.   :-0.3211  
##  1st Qu.:-0.49388   1st Qu.: -2.6254   1st Qu.:-0.6488   1st Qu.:-0.3211  
##  Median :-0.29388   Median :  0.3746   Median : 0.3512   Median :-0.3211  
##  Mean   : 0.00000   Mean   :  0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.00612   3rd Qu.:  3.3746   3rd Qu.: 0.3512   3rd Qu.: 0.6789  
##  Max.   : 8.00612   Max.   : 11.3746   Max.   : 0.3512   Max.   : 0.6789  
##       time          DEATH_EVENT     
##  Min.   :-126.26   Min.   :-0.3211  
##  1st Qu.: -57.26   1st Qu.:-0.3211  
##  Median : -15.26   Median :-0.3211  
##  Mean   :   0.00   Mean   : 0.0000  
##  3rd Qu.:  72.74   3rd Qu.: 0.6789  
##  Max.   : 154.74   Max.   : 0.6789

# Standardize data ----
# Centering followed by scaling: each column ends up with mean 0 and
# standard deviation 1.
# NOTE(review): columns 1:13 include DEATH_EVENT, so the class label is
# standardized together with the predictors.
preprocessParams <- preProcess(hf[, 1:13], method = c("center", "scale"))
# Show which transformations were set up.
print(preprocessParams)

## Created from 299 samples and 13 variables
## 
## Pre-processing:
##   - centered (13)
##   - ignored (0)
##   - scaled (13)

# Apply the stored parameters to the same 13 columns.
transformed <- predict(preprocessParams, newdata = hf[, 1:13])
# Inspect the value ranges after the transformation.
summary(transformed)

##       age              anaemia        creatinine_phosphokinase
##  Min.   :-1.75151   Min.   :-0.8696   Min.   :-0.575952       
##  1st Qu.:-0.82674   1st Qu.:-0.8696   1st Qu.:-0.479589       
##  Median :-0.07011   Median :-0.8696   Median :-0.342001       
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000000       
##  3rd Qu.: 0.77060   3rd Qu.: 1.1460   3rd Qu.: 0.000165       
##  Max.   : 2.87235   Max.   : 1.1460   Max.   : 7.502063       
##     diabetes       ejection_fraction   high_blood_pressure   platelets       
##  Min.   :-0.8462   Min.   :-2.034976   Min.   :-0.7345     Min.   :-2.43607  
##  1st Qu.:-0.8462   1st Qu.:-0.683035   1st Qu.:-0.7345     1st Qu.:-0.52000  
##  Median :-0.8462   Median :-0.007065   Median :-0.7345     Median :-0.01388  
##  Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.0000     Mean   : 0.00000  
##  3rd Qu.: 1.1779   3rd Qu.: 0.584409   3rd Qu.: 1.3570     3rd Qu.: 0.41043  
##  Max.   : 1.1779   Max.   : 3.541779   Max.   : 1.3570     Max.   : 5.99812  
##  serum_creatinine     serum_sodium           sex             smoking       
##  Min.   :-0.864061   Min.   :-5.35423   Min.   :-1.3570   Min.   :-0.6865  
##  1st Qu.:-0.477404   1st Qu.:-0.59500   1st Qu.:-1.3570   1st Qu.:-0.6865  
##  Median :-0.284076   Median : 0.08489   Median : 0.7345   Median :-0.6865  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.005916   3rd Qu.: 0.76478   3rd Qu.: 0.7345   3rd Qu.: 1.4517  
##  Max.   : 7.739045   Max.   : 2.57782   Max.   : 0.7345   Max.   : 1.4517  
##       time          DEATH_EVENT     
##  Min.   :-1.6268   Min.   :-0.6865  
##  1st Qu.:-0.7378   1st Qu.:-0.6865  
##  Median :-0.1966   Median :-0.6865  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.9372   3rd Qu.: 1.4517  
##  Max.   : 1.9937   Max.   : 1.4517

# Normalize data ----
# The range transform re-scales every column to the interval [0, 1].
# NOTE(review): columns 1:13 include DEATH_EVENT; it is already coded 0/1, so
# this particular transform leaves its values as they are.
preprocessParams <- preProcess(hf[, 1:13], method = "range")
# Show which transformations were set up.
print(preprocessParams)

## Created from 299 samples and 13 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (13)

# Apply the stored parameters to the same 13 columns.
transformed <- predict(preprocessParams, newdata = hf[, 1:13])
# Inspect the value ranges after the transformation.
summary(transformed)

##       age            anaemia       creatinine_phosphokinase    diabetes     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000          Min.   :0.0000  
##  1st Qu.:0.2000   1st Qu.:0.0000   1st Qu.:0.01193          1st Qu.:0.0000  
##  Median :0.3636   Median :0.0000   Median :0.02896          Median :0.0000  
##  Mean   :0.3788   Mean   :0.4314   Mean   :0.07130          Mean   :0.4181  
##  3rd Qu.:0.5455   3rd Qu.:1.0000   3rd Qu.:0.07132          3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000          Max.   :1.0000  
##  ejection_fraction high_blood_pressure   platelets      serum_creatinine 
##  Min.   :0.0000    Min.   :0.0000      Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2424    1st Qu.:0.0000      1st Qu.:0.2272   1st Qu.:0.04494  
##  Median :0.3636    Median :0.0000      Median :0.2872   Median :0.06742  
##  Mean   :0.3649    Mean   :0.3512      Mean   :0.2888   Mean   :0.10044  
##  3rd Qu.:0.4697    3rd Qu.:1.0000      3rd Qu.:0.3375   3rd Qu.:0.10112  
##  Max.   :1.0000    Max.   :1.0000      Max.   :1.0000   Max.   :1.00000  
##   serum_sodium         sex            smoking            time       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.6000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.2456  
##  Median :0.6857   Median :1.0000   Median :0.0000   Median :0.3950  
##  Mean   :0.6750   Mean   :0.6488   Mean   :0.3211   Mean   :0.4493  
##  3rd Qu.:0.7714   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.7082  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   DEATH_EVENT    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3211  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

# CLASSIFICATION TREE ----
# (the rpart model built from this split is fitted with method = "class")
# Seed the random number generator.
# NOTE(review): the split below is purely positional, so this seed has no
# influence on which rows land in each set.
set.seed(200)
# Positional hold-out: rows 1-239 for training, rows 240-299 for testing.
# NOTE(review): the rows appear to be ordered by follow-up `time`, so this is
# not a random sample. The dataset has 203 survivors / 96 deaths overall and
# the fitted tree reports 146 / 93 in the training rows, which leaves only
# 57 / 3 in the test rows. Consider a shuffled, stratified split.
train_indices <- seq_len(239)
test_indices <- seq(from = 240, to = 299)
# Build the training and testing datasets from those row positions.
training_data <- hf[train_indices, ]
testing_data <- hf[test_indices, ]

# Fit a classification tree (method = "class") that predicts DEATH_EVENT from
# five predictors, using the training rows only; the model is kept in fit.rg.
fit.rg <- rpart(DEATH_EVENT ~ age + ejection_fraction + serum_creatinine + serum_sodium + time,
                data=training_data, method="class")
# plot() on an rpart object draws only the branches. text() adds the split
# rules and leaf labels (use.n = TRUE also prints the class counts per leaf),
# and the extra margin keeps those labels from being clipped at the border.
plot(fit.rg, margin = 0.1)
text(fit.rg, use.n = TRUE)

# Save the same labelled plot as a PNG file.
png("regression_tree.png")
plot(fit.rg, margin = 0.1)
text(fit.rg, use.n = TRUE)
dev.off()

## png 
##   2

# Draw a more readable, annotated version of the fitted tree.
fancyRpartPlot(model = fit.rg)

# Print the detailed description of the tree: the complexity table, variable
# importance and the candidate splits at every node.
summary(object = fit.rg)

## Call:
## rpart(formula = DEATH_EVENT ~ age + ejection_fraction + serum_creatinine + 
##     serum_sodium + time, data = training_data, method = "class")
##   n= 239 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.53763441      0 1.0000000 1.0000000 0.08104682
## 2 0.05376344      1 0.4623656 0.5053763 0.06607203
## 3 0.03225806      2 0.4086022 0.5376344 0.06761359
## 4 0.01075269      3 0.3763441 0.5268817 0.06711088
## 5 0.01000000      4 0.3655914 0.5376344 0.06761359
## 
## Variable importance
##              time  serum_creatinine ejection_fraction               age 
##                62                24                 6                 6 
##      serum_sodium 
##                 2 
## 
## Node number 1: 239 observations,    complexity param=0.5376344
##   predicted class=0  expected loss=0.3891213  P(node) =1
##     class counts:   146    93
##    probabilities: 0.611 0.389 
##   left son=2 (169 obs) right son=3 (70 obs)
##   Primary splits:
##       time              < 67.5  to the right, improve=43.368150, (0 missing)
##       serum_creatinine  < 1.55  to the left,  improve=18.100590, (0 missing)
##       ejection_fraction < 22.5  to the right, improve=10.911660, (0 missing)
##       serum_sodium      < 135.5 to the right, improve= 7.373314, (0 missing)
##       age               < 79.5  to the left,  improve= 6.949012, (0 missing)
##   Surrogate splits:
##       age               < 86.5  to the left,  agree=0.732, adj=0.086, (0 split)
##       serum_creatinine  < 1.815 to the left,  agree=0.732, adj=0.086, (0 split)
##       ejection_fraction < 22.5  to the right, agree=0.724, adj=0.057, (0 split)
##       serum_sodium      < 122.5 to the right, agree=0.711, adj=0.014, (0 split)
## 
## Node number 2: 169 observations,    complexity param=0.05376344
##   predicted class=0  expected loss=0.1952663  P(node) =0.707113
##     class counts:   136    33
##    probabilities: 0.805 0.195 
##   left son=4 (138 obs) right son=5 (31 obs)
##   Primary splits:
##       serum_creatinine  < 1.55  to the left,  improve=11.276520, (0 missing)
##       ejection_fraction < 32.5  to the right, improve= 6.255623, (0 missing)
##       serum_sodium      < 135.5 to the right, improve= 4.114378, (0 missing)
##       age               < 71    to the left,  improve= 3.057750, (0 missing)
##       time              < 182.5 to the right, improve= 1.205818, (0 missing)
##   Surrogate splits:
##       serum_sodium < 124.5 to the right, agree=0.828, adj=0.065, (0 split)
## 
## Node number 3: 70 observations
##   predicted class=1  expected loss=0.1428571  P(node) =0.292887
##     class counts:    10    60
##    probabilities: 0.143 0.857 
## 
## Node number 4: 138 observations
##   predicted class=0  expected loss=0.1086957  P(node) =0.5774059
##     class counts:   123    15
##    probabilities: 0.891 0.109 
## 
## Node number 5: 31 observations,    complexity param=0.03225806
##   predicted class=1  expected loss=0.4193548  P(node) =0.1297071
##     class counts:    13    18
##    probabilities: 0.419 0.581 
##   left son=10 (7 obs) right son=11 (24 obs)
##   Primary splits:
##       serum_creatinine  < 2.85  to the right, improve=1.5729650, (0 missing)
##       time              < 109.5 to the left,  improve=1.1808080, (0 missing)
##       ejection_fraction < 32.5  to the right, improve=1.1808080, (0 missing)
##       serum_sodium      < 135.5 to the right, improve=1.0529150, (0 missing)
##       age               < 71    to the left,  improve=0.7331378, (0 missing)
##   Surrogate splits:
##       ejection_fraction < 55    to the right, agree=0.839, adj=0.286, (0 split)
## 
## Node number 10: 7 observations
##   predicted class=0  expected loss=0.2857143  P(node) =0.0292887
##     class counts:     5     2
##    probabilities: 0.714 0.286 
## 
## Node number 11: 24 observations,    complexity param=0.01075269
##   predicted class=1  expected loss=0.3333333  P(node) =0.1004184
##     class counts:     8    16
##    probabilities: 0.333 0.667 
##   left son=22 (9 obs) right son=23 (15 obs)
##   Primary splits:
##       ejection_fraction < 32.5  to the right, improve=1.4222220, (0 missing)
##       time              < 92.5  to the left,  improve=1.1204480, (0 missing)
##       serum_sodium      < 135.5 to the right, improve=0.6666667, (0 missing)
##       age               < 71    to the left,  improve=0.6095238, (0 missing)
##       serum_creatinine  < 1.815 to the left,  improve=0.3333333, (0 missing)
##   Surrogate splits:
##       age              < 74    to the right, agree=0.792, adj=0.444, (0 split)
##       serum_creatinine < 1.75  to the right, agree=0.667, adj=0.111, (0 split)
##       serum_sodium     < 141.5 to the right, agree=0.667, adj=0.111, (0 split)
##       time             < 80    to the left,  agree=0.667, adj=0.111, (0 split)
## 
## Node number 22: 9 observations
##   predicted class=0  expected loss=0.4444444  P(node) =0.0376569
##     class counts:     5     4
##    probabilities: 0.556 0.444 
## 
## Node number 23: 15 observations
##   predicted class=1  expected loss=0.2  P(node) =0.06276151
##     class counts:     3    12
##    probabilities: 0.200 0.800

# KNN ----
# Hold part of the data back so the classifier is scored on rows it never saw:
# the training rows are used to build the KNN model, the test rows tell us how
# well that model performs.
# NOTE(review): rows are taken in file order rather than sampled at random, so
# the two partitions may differ in class balance -- confirm this is intended.
hf_knn_train <- hf[seq_len(239), ]
head(hf_knn_train)

## # A tibble: 6 × 13
##     age anaemia creatinine_phosphokinase diabetes ejection_fraction
##   <dbl>   <dbl>                    <dbl>    <dbl>             <dbl>
## 1    75       0                      582        0                20
## 2    55       0                     7861        0                38
## 3    65       0                      146        0                20
## 4    50       1                      111        0                20
## 5    65       1                      160        1                20
## 6    90       1                       47        0                40
## # ℹ 8 more variables: high_blood_pressure <dbl>, platelets <dbl>,
## #   serum_creatinine <dbl>, serum_sodium <dbl>, sex <dbl>, smoking <dbl>,
## #   time <dbl>, DEATH_EVENT <dbl>

# The remaining rows (240 through 299) form the held-out test partition.
hf_knn_test <- hf[seq(240, 299), ]
head(hf_knn_test)

## # A tibble: 6 × 13
##     age anaemia creatinine_phosphokinase diabetes ejection_fraction
##   <dbl>   <dbl>                    <dbl>    <dbl>             <dbl>
## 1    55       1                      180        0                45
## 2    70       0                       81        1                35
## 3    65       0                      582        1                30
## 4    40       0                       90        0                35
## 5    73       1                     1185        0                40
## 6    54       0                      582        1                38
## # ℹ 8 more variables: high_blood_pressure <dbl>, platelets <dbl>,
## #   serum_creatinine <dbl>, serum_sodium <dbl>, sex <dbl>, smoking <dbl>,
## #   time <dbl>, DEATH_EVENT <dbl>

# The train/test frames built above still contain DEATH_EVENT (column 13).
# knn() takes the class labels as a separate argument, so the label column of
# each partition is also stored on its own as a one-column tibble.
hf_knn_train_labels <- hf[seq_len(239), "DEATH_EVENT"]
hf_knn_test_labels <- hf[seq(240, 299), "DEATH_EVENT"]
head(hf_knn_test_labels)

## # A tibble: 6 × 1
##   DEATH_EVENT
##         <dbl>
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0

# Rule of thumb: start with k close to the square root of the number of
# training rows.
sqrt(nrow(hf_knn_train))

## [1] 15.45962

# Train and classify with k-nearest neighbours (knn() from the `class`
# package). hf_knn_train and hf_knn_test still carry the DEATH_EVENT column,
# so it is dropped before distances are computed; otherwise the outcome being
# predicted would itself be used as a predictor (label leakage).
# NOTE(review): the remaining predictors are on very different scales (e.g.
# platelets vs. serum_creatinine); consider normalising them, since knn()
# works on Euclidean distance.
hf_knn_predictors <- setdiff(names(hf_knn_train), "DEATH_EVENT")
hf_knn_test_pred <- knn(
  train = hf_knn_train[, hf_knn_predictors],
  test = hf_knn_test[, hf_knn_predictors],
  cl = hf_knn_train_labels$DEATH_EVENT,
  k = 15
)
hf_knn_test_pred

##  [1] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0
## [39] 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0
## Levels: 0 1

# Score the model: cross-tabulate the actual test labels (rows) against the
# KNN predictions (columns). The chi-square contribution is left out of the
# cells.
CrossTable(
  prop.chisq = FALSE,
  x = hf_knn_test_labels$DEATH_EVENT,
  y = hf_knn_test_pred
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  60 
## 
##  
##                                | hf_knn_test_pred 
## hf_knn_test_labels$DEATH_EVENT |         0 |         1 | Row Total | 
## -------------------------------|-----------|-----------|-----------|
##                              0 |        47 |        10 |        57 | 
##                                |     0.825 |     0.175 |     0.950 | 
##                                |     0.959 |     0.909 |           | 
##                                |     0.783 |     0.167 |           | 
## -------------------------------|-----------|-----------|-----------|
##                              1 |         2 |         1 |         3 | 
##                                |     0.667 |     0.333 |     0.050 | 
##                                |     0.041 |     0.091 |           | 
##                                |     0.033 |     0.017 |           | 
## -------------------------------|-----------|-----------|-----------|
##                   Column Total |        49 |        11 |        60 | 
##                                |     0.817 |     0.183 |           | 
## -------------------------------|-----------|-----------|-----------|
## 
##

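#this is called a confusion matrix; treating death (1) as the positive class,
#we can see that we have 47 TN and 1 TP,
#and we have 10 FP (survivors predicted to die) and 2 FN (deaths predicted to survive)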


# SVM ----
# Positional split: rows 1-239 for training, rows 240-299 for testing.

heartfailure_train <- heartFailure[seq_len(239), ]
heartfailure_test <- heartFailure[seq(240, 299), ]
# Make the outcome a factor in the training frame before fitting.
heartfailure_train[["DEATH_EVENT"]] <- factor(
  heartfailure_train[["DEATH_EVENT"]]
)
# Linear ("vanilladot") kernel SVM with every other column as a predictor.
heartfailure_classifier <- ksvm(
  DEATH_EVENT ~ .,
  data = heartfailure_train,
  kernel = "vanilladot"
)

##  Setting default kernel parameters

heartfailure_classifier

## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 115 
## 
## Objective Function Value : -108.2006 
## Training error : 0.171548

# Classify the held-out rows with the linear-kernel SVM and show the result.
hf_predict <- predict(
  heartfailure_classifier,
  newdata = heartfailure_test
)
hf_predict

##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1

head(hf_predict)

## [1] 0 0 0 0 0 0
## Levels: 0 1

# Accuracy of the linear-kernel SVM: element-wise check of every true test
# label against the corresponding prediction.
agreement <- heartfailure_test$DEATH_EVENT == hf_predict
agreement

##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [25]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

table(agreement)

## agreement
## FALSE  TRUE 
##     3    57

prop.table(table(agreement))

## agreement
## FALSE  TRUE 
##  0.05  0.95

table(hf_predict,heartfailure_test$DEATH_EVENT)

##           
## hf_predict  0  1
##          0 57  3
##          1  0  0

# SVM with a radial basis ("rbfdot") kernel, fitted on the same training
# frame as the linear-kernel model so the two can be compared.
heartfailure_classifier_rbf <- ksvm(
  DEATH_EVENT ~ .,
  data = heartfailure_train,
  kernel = "rbfdot"
)
heartfailure_classifier_rbf

## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0562980380456991 
## 
## Number of Support Vectors : 152 
## 
## Objective Function Value : -108.7988 
## Training error : 0.133891

# Classify the held-out rows with the RBF-kernel SVM and show the result.
hf_predict_rbf <- predict(
  heartfailure_classifier_rbf,
  newdata = heartfailure_test
)
hf_predict_rbf

##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## Levels: 0 1

head(hf_predict_rbf)

## [1] 0 0 0 0 0 0
## Levels: 0 1

table(hf_predict_rbf,heartfailure_test$DEATH_EVENT)

##               
## hf_predict_rbf  0  1
##              0 56  3
##              1  1  0

# Accuracy of the RBF-kernel SVM: element-wise check of every true test label
# against the corresponding prediction.
agreement_rbf <- heartfailure_test$DEATH_EVENT == hf_predict_rbf
agreement_rbf

##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [25]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE

table(agreement_rbf)

## agreement_rbf
## FALSE  TRUE 
##     4    56

prop.table(table(agreement_rbf))

## agreement_rbf
##      FALSE       TRUE 
## 0.06666667 0.93333333

# DECISION TREE ----
# The outcome must be a factor for the C5.0 fit below. The SVM section already
# converted this column, so applying factor() again leaves it unchanged.
heartfailure_train[["DEATH_EVENT"]] <- factor(
  heartfailure_train[["DEATH_EVENT"]]
)
str(heartfailure_train) # confirm DEATH_EVENT is a factor

## tibble [239 × 13] (S3: tbl_df/tbl/data.frame)
##  $ age                     : num [1:239] 75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : num [1:239] 0 0 0 1 1 1 1 1 0 1 ...
##  $ creatinine_phosphokinase: num [1:239] 582 7861 146 111 160 ...
##  $ diabetes                : num [1:239] 0 0 0 0 1 0 0 1 0 0 ...
##  $ ejection_fraction       : num [1:239] 20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : num [1:239] 1 0 0 0 0 1 0 0 0 1 ...
##  $ platelets               : num [1:239] 265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num [1:239] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : num [1:239] 130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : num [1:239] 1 1 1 1 0 1 1 1 0 1 ...
##  $ smoking                 : num [1:239] 0 0 1 0 0 1 0 1 0 1 ...
##  $ time                    : num [1:239] 4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

# Fit a single C5.0 tree: every column except the 13th (DEATH_EVENT) is a
# predictor, and the factor outcome is passed separately.
heartfailure_model <- C5.0(
  x = heartfailure_train[-13],
  y = heartfailure_train$DEATH_EVENT
)
summary(heartfailure_model)

## 
## Call:
## C5.0.default(x = heartfailure_train[-13], y = heartfailure_train$DEATH_EVENT)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue May  7 18:34:17 2024
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 239 cases (13 attributes) from undefined.data
## 
## Decision tree:
## 
## time <= 67: 1 (70/10)
## time > 67:
## :...serum_creatinine > 1.5:
##     :...creatinine_phosphokinase <= 62: 0 (4)
##     :   creatinine_phosphokinase > 62:
##     :   :...high_blood_pressure > 0: 1 (8)
##     :       high_blood_pressure <= 0:
##     :       :...diabetes <= 0:
##     :           :...platelets <= 277000: 1 (7)
##     :           :   platelets > 277000: 0 (3)
##     :           diabetes > 0:
##     :           :...sex <= 0: 1 (2)
##     :               sex > 0: 0 (7/1)
##     serum_creatinine <= 1.5:
##     :...age > 79:
##         :...serum_sodium <= 135: 1 (5/1)
##         :   serum_sodium > 135: 0 (3)
##         age <= 79:
##         :...ejection_fraction > 30: 0 (99/3)
##             ejection_fraction <= 30:
##             :...time <= 78: 1 (3)
##                 time > 78:
##                 :...high_blood_pressure > 0: 0 (8)
##                     high_blood_pressure <= 0:
##                     :...diabetes <= 0: 0 (12/1)
##                         diabetes > 0:
##                         :...serum_creatinine <= 0.9: 0 (2)
##                             serum_creatinine > 0.9:
##                             :...time <= 175: 1 (4)
##                                 time > 175: 0 (2)
## 
## 
## Evaluation on training data (239 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      16   16( 6.7%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     135    11    (a): class 0
##       5    88    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% time
##   70.71% serum_creatinine
##   57.74% age
##   54.39% ejection_fraction
##   23.01% high_blood_pressure
##   16.32% diabetes
##   12.97% creatinine_phosphokinase
##    4.18% platelets
##    3.77% sex
##    3.35% serum_sodium
## 
## 
## Time: 0.0 secs

# Classify the test rows with the single C5.0 tree and show the predictions.
heartfailure_pred <- predict(
  heartfailure_model,
  newdata = heartfailure_test
)
heartfailure_pred

##  [1] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## [39] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1

# Confusion table for the single C5.0 tree: actual outcome in rows, predicted
# outcome in columns; only counts and table proportions are shown.
# NOTE(review): CrossTable() is already called earlier in this file, so
# gmodels is presumably attached before this point -- confirm, and consider
# moving this library() call up with the other package loads.
library(gmodels)
CrossTable(
  x = heartfailure_test$DEATH_EVENT,
  y = heartfailure_pred,
  dnn = c('actual death', 'predicted death'),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  60 
## 
##  
##              | predicted death 
## actual death |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        53 |         4 |        57 | 
##              |     0.883 |     0.067 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         2 |         1 |         3 | 
##              |     0.033 |     0.017 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        55 |         5 |        60 | 
## -------------|-----------|-----------|-----------|
## 
##

# Boost the C5.0 tree with 10 trials and summarise the resulting ensemble.
# The argument is spelled out as `trials`; the abbreviated `trial` only worked
# through R's partial argument matching.
heartfailure_boost <- C5.0(
  x = heartfailure_train[-13],
  y = heartfailure_train$DEATH_EVENT,
  trials = 10
)
summary(heartfailure_boost)

## 
## Call:
## C5.0.default(x = heartfailure_train[-13], y =
##  heartfailure_train$DEATH_EVENT, trials = 10)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue May  7 18:34:17 2024
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 239 cases (13 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## time <= 67: 1 (70/10)
## time > 67:
## :...serum_creatinine > 1.5:
##     :...creatinine_phosphokinase <= 62: 0 (4)
##     :   creatinine_phosphokinase > 62:
##     :   :...high_blood_pressure > 0: 1 (8)
##     :       high_blood_pressure <= 0:
##     :       :...diabetes <= 0:
##     :           :...platelets <= 277000: 1 (7)
##     :           :   platelets > 277000: 0 (3)
##     :           diabetes > 0:
##     :           :...sex <= 0: 1 (2)
##     :               sex > 0: 0 (7/1)
##     serum_creatinine <= 1.5:
##     :...age > 79:
##         :...serum_sodium <= 135: 1 (5/1)
##         :   serum_sodium > 135: 0 (3)
##         age <= 79:
##         :...ejection_fraction > 30: 0 (99/3)
##             ejection_fraction <= 30:
##             :...time <= 78: 1 (3)
##                 time > 78:
##                 :...high_blood_pressure > 0: 0 (8)
##                     high_blood_pressure <= 0:
##                     :...diabetes <= 0: 0 (12/1)
##                         diabetes > 0:
##                         :...serum_creatinine <= 0.9: 0 (2)
##                             serum_creatinine > 0.9:
##                             :...time <= 175: 1 (4)
##                                 time > 175: 0 (2)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## ejection_fraction <= 20: 1 (20.4/2.3)
## ejection_fraction > 20:
## :...time > 50:
##     :...ejection_fraction <= 30:
##     :   :...anaemia <= 0: 0 (18.4/5.4)
##     :   :   anaemia > 0: 1 (18.8/6.9)
##     :   ejection_fraction > 30:
##     :   :...platelets <= 153000: 1 (14.6/5.4)
##     :       platelets > 153000: 0 (109.9/12.7)
##     time <= 50:
##     :...smoking > 0: 1 (10.8)
##         smoking <= 0:
##         :...serum_sodium > 139: 1 (6.1)
##             serum_sodium <= 139:
##             :...age <= 73: 0 (34.6/9.2)
##                 age > 73: 1 (5.4)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## time <= 11: 1 (12.3)
## time > 11:
## :...serum_creatinine > 1.5:
##     :...serum_sodium <= 135:
##     :   :...creatinine_phosphokinase <= 607: 1 (30.6/1.9)
##     :   :   creatinine_phosphokinase > 607: 0 (2.8)
##     :   serum_sodium > 135:
##     :   :...sex <= 0: 1 (7.4/1.9)
##     :       sex > 0: 0 (13/2.8)
##     serum_creatinine <= 1.5:
##     :...time > 172: 0 (31.2)
##         time <= 172:
##         :...time > 148: 1 (10.2/0.6)
##             time <= 148:
##             :...age > 67:
##                 :...time <= 115: 1 (30.6/8.3)
##                 :   time > 115: 0 (7.7/1.5)
##                 age <= 67:
##                 :...creatinine_phosphokinase > 2281: 1 (7.7/1.2)
##                     creatinine_phosphokinase <= 2281:
##                     :...serum_sodium <= 131: 1 (4.3/0.6)
##                         serum_sodium > 131:
##                         :...time > 78: 0 (44.1)
##                             time <= 78:
##                             :...serum_sodium <= 140: 0 (34.3/4.9)
##                                 serum_sodium > 140: 1 (2.8)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## ejection_fraction <= 20:
## :...time <= 140: 1 (17)
## :   time > 140: 0 (4.6/1)
## ejection_fraction > 20:
## :...time > 50:
##     :...serum_creatinine <= 1.18:
##     :   :...smoking > 0: 0 (38.4/2.2)
##     :   :   smoking <= 0:
##     :   :   :...creatinine_phosphokinase > 2281: 1 (6.2/1.2)
##     :   :       creatinine_phosphokinase <= 2281:
##     :   :       :...serum_creatinine <= 0.6: 1 (2.6)
##     :   :           serum_creatinine > 0.6: 0 (45.1/2.9)
##     :   serum_creatinine > 1.18:
##     :   :...creatinine_phosphokinase <= 75: 0 (14.9)
##     :       creatinine_phosphokinase > 75:
##     :       :...sex <= 0: 1 (20.9/4.8)
##     :           sex > 0:
##     :           :...diabetes <= 0: 1 (19.7/7.2)
##     :               diabetes > 0: 0 (14.6/2.9)
##     time <= 50:
##     :...smoking > 0: 1 (11.8)
##         smoking <= 0:
##         :...serum_sodium > 139: 1 (7.2)
##             serum_sodium <= 139:
##             :...serum_sodium <= 133: 1 (4.6)
##                 serum_sodium > 133:
##                 :...time <= 11: 1 (4.8)
##                     time > 11:
##                     :...time <= 38: 0 (24.2/6)
##                         time > 38: 1 (2.4)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## time > 73:
## :...serum_creatinine > 1.4:
## :   :...ejection_fraction <= 45: 1 (30.4/10.8)
## :   :   ejection_fraction > 45: 0 (9.5/0.4)
## :   serum_creatinine <= 1.4:
## :   :...time > 172: 0 (28.9)
## :       time <= 172:
## :       :...time <= 148: 0 (71.3/13.8)
## :           time > 148: 1 (10.2/1.7)
## time <= 73:
## :...age > 73: 1 (17.5)
##     age <= 73:
##     :...ejection_fraction <= 20: 1 (9.8)
##         ejection_fraction > 20:
##         :...serum_sodium > 139: 1 (11.3/0.4)
##             serum_sodium <= 139:
##             :...serum_sodium > 136: 0 (25.2/6.8)
##                 serum_sodium <= 136:
##                 :...age <= 70: 1 (21.1/2.1)
##                     age > 70: 0 (3.9)
## 
## -----  Trial 5:  -----
## 
## Decision tree:
## 
## time <= 67:
## :...platelets <= 213000: 1 (20.3)
## :   platelets > 213000:
## :   :...platelets <= 224000: 0 (7.1/0.7)
## :       platelets > 224000:
## :       :...creatinine_phosphokinase <= 109: 0 (12.6/4.1)
## :           creatinine_phosphokinase > 109: 1 (38.4/4.9)
## time > 67:
## :...ejection_fraction > 38:
##     :...diabetes > 0: 0 (14.9)
##     :   diabetes <= 0:
##     :   :...creatinine_phosphokinase <= 124: 0 (15.6)
##     :       creatinine_phosphokinase > 124:
##     :       :...serum_creatinine > 1.6: 1 (5.2)
##     :           serum_creatinine <= 1.6:
##     :           :...creatinine_phosphokinase <= 145: 1 (7.5/0.6)
##     :               creatinine_phosphokinase > 145: 0 (24.3/1.6)
##     ejection_fraction <= 38:
##     :...platelets <= 87000: 1 (5.9)
##         platelets > 87000:
##         :...serum_creatinine > 2.9: 0 (6.8)
##             serum_creatinine <= 2.9:
##             :...ejection_fraction > 35: 1 (13.9/4.6)
##                 ejection_fraction <= 35:
##                 :...time <= 78: 1 (8/0.7)
##                     time > 78:
##                     :...ejection_fraction > 30: 0 (17.5)
##                         ejection_fraction <= 30:
##                         :...time <= 110: 0 (15.3/0.9)
##                             time > 110:
##                             :...time <= 198: 1 (21.4/6.9)
##                                 time > 198: 0 (4.2)
## 
## -----  Trial 6:  -----
## 
## Decision tree:
## 
## time <= 67:
## :...age > 73: 1 (13.9)
## :   age <= 73:
## :   :...high_blood_pressure > 0: 1 (28.7/5)
## :       high_blood_pressure <= 0:
## :       :...time <= 16: 1 (6)
## :           time > 16:
## :           :...serum_sodium > 139: 1 (5)
## :               serum_sodium <= 139:
## :               :...serum_sodium <= 136: 1 (9.2/2.7)
## :                   serum_sodium > 136: 0 (12.9)
## time > 67:
## :...sex <= 0:
##     :...creatinine_phosphokinase <= 122: 0 (16.2/0.2)
##     :   creatinine_phosphokinase > 122:
##     :   :...serum_creatinine > 1.2: 1 (16.1/2.3)
##     :       serum_creatinine <= 1.2:
##     :       :...platelets <= 153000: 1 (6.4/0.6)
##     :           platelets > 153000: 0 (19.1/2.5)
##     sex > 0:
##     :...high_blood_pressure > 0: 0 (26.2/2.8)
##         high_blood_pressure <= 0:
##         :...age <= 55: 0 (25.7/1.9)
##             age > 55:
##             :...ejection_fraction <= 20: 1 (3.7)
##                 ejection_fraction > 20:
##                 :...platelets <= 201000: 0 (14.7)
##                     platelets > 201000:
##                     :...platelets <= 228000: 1 (6.4)
##                         platelets > 228000: 0 (28.7/9.2)
## 
## -----  Trial 7:  -----
## 
## Decision tree:
## 
## time > 180: 0 (29.9/2.6)
## time <= 180:
## :...serum_creatinine <= 0.9: 0 (38.4/8.4)
##     serum_creatinine > 0.9:
##     :...ejection_fraction <= 20: 1 (11.6)
##         ejection_fraction > 20:
##         :...age > 73:
##             :...serum_sodium <= 132: 1 (11.7)
##             :   serum_sodium > 132:
##             :   :...creatinine_phosphokinase > 855: 0 (3.6)
##             :       creatinine_phosphokinase <= 855:
##             :       :...platelets <= 334000: 1 (22.2/2.5)
##             :           platelets > 334000: 0 (3.8/0.2)
##             age <= 73:
##             :...platelets > 328000:
##                 :...time <= 20: 0 (4/0.2)
##                 :   time > 20: 1 (24.5/1.7)
##                 platelets <= 328000:
##                 :...time > 82:
##                     :...time <= 148: 0 (30.8/0.9)
##                     :   time > 148: 1 (9.8/3.8)
##                     time <= 82:
##                     :...platelets <= 217000: 1 (14.4/1.1)
##                         platelets > 217000:
##                         :...serum_sodium > 139: 1 (7/0.2)
##                             serum_sodium <= 139:
##                             :...time <= 13: 1 (3.4)
##                                 time > 13: 0 (23.8/3.5)
## 
## -----  Trial 8:  -----
## 
## Decision tree:
## 
## time <= 73:
## :...serum_sodium <= 133: 1 (17.6)
## :   serum_sodium > 133:
## :   :...ejection_fraction > 45: 0 (17.5/5.9)
## :       ejection_fraction <= 45:
## :       :...serum_sodium <= 136: 1 (14.1)
## :           serum_sodium > 136:
## :           :...sex <= 0: 0 (11.5/3.6)
## :               sex > 0:
## :               :...serum_creatinine <= 2.1: 1 (21.5/3.7)
## :                   serum_creatinine > 2.1: 0 (2.6)
## time > 73:
## :...platelets <= 126000: 1 (9.6/0.7)
##     platelets > 126000:
##     :...creatinine_phosphokinase <= 72: 0 (13.7)
##         creatinine_phosphokinase > 72:
##         :...serum_creatinine > 1.4: 1 (36.8/12)
##             serum_creatinine <= 1.4:
##             :...time > 172: 0 (20.5)
##                 time <= 172:
##                 :...age > 77: 1 (12.5/2.8)
##                     age <= 77:
##                     :...time > 148: 1 (11.6/3.2)
##                         time <= 148:
##                         :...platelets <= 300000: 0 (27.5)
##                             platelets > 300000:
##                             :...smoking <= 0: 1 (13.9/4.4)
##                                 smoking > 0: 0 (8.1)
## 
## -----  Trial 9:  -----
## 
## Decision tree:
## 
## time <= 50:
## :...smoking > 0: 1 (13.8)
## :   smoking <= 0:
## :   :...serum_sodium > 138: 1 (12.6)
## :       serum_sodium <= 138:
## :       :...platelets <= 362000: 1 (33.3/8.4)
## :           platelets > 362000: 0 (5.8/0.1)
## time > 50:
## :...ejection_fraction > 30:
##     :...serum_sodium > 135: 0 (64.7/7.9)
##     :   serum_sodium <= 135:
##     :   :...anaemia > 0: 0 (14.5)
##     :       anaemia <= 0:
##     :       :...time <= 172: 1 (23/3.7)
##     :           time > 172: 0 (7.3)
##     ejection_fraction <= 30:
##     :...time <= 78: 1 (11.5)
##         time > 78:
##         :...serum_sodium > 139: 0 (12.5/0.3)
##             serum_sodium <= 139:
##             :...serum_creatinine <= 1: 0 (9.7/0.5)
##                 serum_creatinine > 1:
##                 :...time <= 109: 0 (9.3/2.4)
##                     time > 109: 1 (21.1/3.2)
## 
## 
## Evaluation on training data (239 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0     16   16( 6.7%)
##    1      9   50(20.9%)
##    2     14   31(13.0%)
##    3     16   34(14.2%)
##    4     11   29(12.1%)
##    5     17   34(14.2%)
##    6     16   18( 7.5%)
##    7     15   36(15.1%)
##    8     15   43(18.0%)
##    9     13   27(11.3%)
## boost              0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     146          (a): class 0
##            93    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% ejection_fraction
##  100.00% platelets
##  100.00% serum_creatinine
##  100.00% serum_sodium
##  100.00% time
##   97.07% age
##   95.40% creatinine_phosphokinase
##   83.68% sex
##   75.31% high_blood_pressure
##   70.71% smoking
##   50.21% diabetes
##   33.05% anaemia
## 
## 
## Time: 0.0 secs

# Score the boosted model on the test rows, then cross-tabulate the actual
# outcome (rows) against the boosted prediction (columns).
heartfailureboost_pred <- predict(
  heartfailure_boost,
  newdata = heartfailure_test
)

CrossTable(
  x = heartfailure_test$DEATH_EVENT,
  y = heartfailureboost_pred,
  dnn = c('actual death', 'predicted death'),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  60 
## 
##  
##              | predicted death 
## actual death |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        54 |         3 |        57 | 
##              |     0.900 |     0.050 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         3 |         0 |         3 | 
##              |     0.050 |     0.000 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        57 |         3 |        60 | 
## -------------|-----------|-----------|-----------|
## 
##

# Build the dimension names for the error-cost matrix. They use the outcome's
# factor levels ("0", "1") so the matrix lines up with DEATH_EVENT, and each
# axis is named so the printed matrix shows which one is which.
matrix_dimensions <- list(predicted = c("0", "1"), actual = c("0", "1"))
matrix_dimensions

## $predicted
## [1] "0" "1"
## 
## $actual
## [1] "0" "1"

# Define the cost matrix (filled column by column): predicting 0 for an
# actual 1 costs 4, predicting 1 for an actual 0 costs 1, and correct
# predictions cost nothing.
error_cost <- matrix(c(0, 1, 4, 0), nrow = 2, dimnames = matrix_dimensions)
error_cost

##          actual
## predicted 0 1
##         0 0 4
##         1 1 0

# Refit C5.0 with the error-cost matrix, predict the test rows, and
# cross-tabulate actual vs. predicted outcomes. The table axes are labelled
# 'actual death' / 'predicted death', matching the other C5.0 tables in this
# file (the outcome here is DEATH_EVENT, not a loan default).
heartfailure_cost <- C5.0(heartfailure_train[-13], heartfailure_train$DEATH_EVENT,
                          costs = error_cost)

heartfailure_cost_pred <- predict(heartfailure_cost, heartfailure_test)
CrossTable(heartfailure_test$DEATH_EVENT, heartfailure_cost_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual death', 'predicted death'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  60 
## 
##  
##              | predicted death 
## actual death |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        48 |         9 |        57 | 
##              |     0.800 |     0.150 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         2 |         1 |         3 | 
##              |     0.033 |     0.017 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        50 |        10 |        60 | 
## -------------|-----------|-----------|-----------|
## 
##

# Comparing performance
# 3. Evaluation
# True outcomes for the models whose predictions were made on
# heartfailure_test (both SVMs and the C5.0 variants); scoring against the
# same frame keeps predictions and labels row-aligned.
test_actual <- heartfailure_test$DEATH_EVENT

# Classification tree (fit.rg) evaluation: predictions and labels both come
# from testing_data.
rg_pred <- predict(fit.rg, testing_data, type = "class")
rg_accuracy <- mean(rg_pred == testing_data$DEATH_EVENT)
rg_conf_matrix <- table(Actual = testing_data$DEATH_EVENT, Predicted = rg_pred)

# KNN evaluation
# Take the true labels from the KNN test partition as a plain vector. A
# separate name is used so hf_knn_test_labels keeps its one-column tibble form
# for the earlier code that reads hf_knn_test_labels$DEATH_EVENT.
knn_actual <- hf_knn_test$DEATH_EVENT
knn_accuracy <- mean(hf_knn_test_pred == knn_actual)
knn_conf_matrix <- table(Actual = knn_actual, Predicted = hf_knn_test_pred)

# SVM (linear kernel) evaluation
svm_accuracy <- mean(hf_predict == test_actual)
svm_conf_matrix <- table(Actual = test_actual, Predicted = hf_predict)

# SVM (RBF kernel) evaluation: built from the RBF predictions and stored in
# its own object, so the linear-kernel matrix is neither overwritten nor
# duplicated.
svm_accuracy_rbf <- mean(hf_predict_rbf == test_actual)
svm_conf_matrix_rbf <- table(Actual = test_actual, Predicted = hf_predict_rbf)

# Boosted C5.0 decision tree evaluation
dt_accuracy <- mean(heartfailureboost_pred == test_actual)
dt_conf_matrix <- table(Actual = test_actual, Predicted = heartfailureboost_pred)

# Cost-sensitive C5.0 decision tree evaluation
dt_accuracy_cost <- mean(heartfailure_cost_pred == test_actual)

# 4. Comparison
# Print evaluation metrics for each model
cat("Regression Tree Accuracy:", rg_accuracy, "\n")

## Regression Tree Accuracy: 0.9333333

cat("KNN Accuracy:", knn_accuracy, "\n")

## KNN Accuracy: 0.8

cat("SVM kernel Accuracy:", svm_accuracy, "\n")

## SVM kernel Accuracy: 0.95

cat("SVM rbf Accuracy:", svm_accuracy_rbf, "\n")

## SVM rbf Accuracy: 0.9333333

# dt_accuracy comes from the boosted C5.0 predictions and dt_accuracy_cost
# from the cost-sensitive C5.0 predictions, so each gets its own label
# instead of both being printed as "Decision Tree Accuracy:".
cat("Boosted Decision Tree Accuracy:", dt_accuracy, "\n")

## Boosted Decision Tree Accuracy: 0.9

cat("Cost-Sensitive Decision Tree Accuracy:", dt_accuracy_cost, "\n")

## Cost-Sensitive Decision Tree Accuracy: 0.8166667

# Print confusion matrix for each model
cat("\nRegression Tree Confusion Matrix:\n")

## 
## Regression Tree Confusion Matrix:

print(rg_conf_matrix)

##       Predicted
## Actual  0  1
##      0 55  2
##      1  2  1

cat("\nKNN Confusion Matrix:\n")

## 
## KNN Confusion Matrix:

print(knn_conf_matrix)

##       Predicted
## Actual  0  1
##      0 47 10
##      1  2  1

cat("\nSVM Confusion Matrix:\n")

## 
## SVM Confusion Matrix:

print(svm_conf_matrix)

##       Predicted
## Actual  0  1
##      0 57  0
##      1  3  0

# dt_conf_matrix is built from the boosted C5.0 predictions, so the heading
# names the boosted model explicitly.
cat("\nBoosted Decision Tree Confusion Matrix:\n")

## 
## Boosted Decision Tree Confusion Matrix:

print(dt_conf_matrix)

##       Predicted
## Actual  0  1
##      0 54  3
##      1  3  0

Project 4 Final

Destinee Redfearn, Bezawit Tilahun, Jarvis Woodard, James Wilson

2024-05-07