suppressMessages(library(ggplot2))
suppressMessages(library(cowplot))
## Warning: package 'cowplot' was built under R version 3.6.3
suppressMessages(library(randomForest))
## Warning: package 'randomForest' was built under R version 3.6.3

# Loading the data

# Download the processed Cleveland heart-disease table from the UCI archive.
# The file ships without a header row, so the 14 column names are attached
# by hand right after reading it.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
raw.data <- read.csv(file = url, header = FALSE)

names(raw.data) <- c(
  "age",
  "sex",      # 0 = female, 1 = male
  "cp",       # chest pain type:
              #   1 = typical angina
              #   2 = atypical angina
              #   3 = non-anginal pain
              #   4 = asymptomatic
  "trestbps", # resting blood pressure (in mm Hg)
  "chol",     # serum cholesterol in mg/dl
  "fbs",      # fasting blood sugar > 120 mg/dl: 1 = TRUE, 0 = FALSE
  "restecg",  # resting electrocardiographic results (coded 0-2 in the data):
              #   0 = normal
              #   1 = having ST-T wave abnormality
              #   2 = showing probable or definite left ventricular hypertrophy
  "thalach",  # maximum heart rate achieved
  "exang",    # exercise-induced angina: 1 = yes, 0 = no
  "oldpeak",  # ST depression induced by exercise relative to rest
  "slope",    # slope of the peak exercise ST segment:
              #   1 = upsloping
              #   2 = flat
              #   3 = downsloping
  "ca",       # number of major vessels (0-3) colored by fluoroscopy
  "thal",     # short for thallium heart scan:
              #   3 = normal (no cold spots)
              #   6 = fixed defect (cold spots during rest and exercise)
              #   7 = reversible defect (cold spots only appear during exercise)
  "hd"        # diagnosis of heart disease (the attribute to predict):
              #   0 = at most 50% diameter narrowing (no disease)
              #   non-zero = more than 50% diameter narrowing (disease present)
)

# Check the column types before any cleaning is done.
str(raw.data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 5 levels "?","0.0","1.0",..: 2 5 4 2 2 2 4 2 3 2 ...
##  $ thal    : Factor w/ 4 levels "?","3.0","6.0",..: 3 2 4 2 2 2 2 2 4 4 ...
##  $ hd      : int  0 2 1 0 0 0 3 0 2 1 ...

# Cleaning the data

# Clean the raw table: mark missing values as NA and give every categorical
# column a factor type so the random forest treats it as categorical.

# The data set writes missing values as "?"; turn them into proper NAs.
raw.data[raw.data == "?"] <- NA

# sex is coded 0/1; relabel the levels so summaries and plots are readable.
raw.data$sex <- factor(raw.data$sex, labels = c("F", "M"))

# Categorical variables that were read in as plain numbers.
raw.data$cp <- as.factor(raw.data$cp)
raw.data$fbs <- as.factor(raw.data$fbs)
raw.data$restecg <- as.factor(raw.data$restecg)
raw.data$exang <- as.factor(raw.data$exang)
raw.data$slope <- as.factor(raw.data$slope)

# ca and thal contained "?" entries, so they were not read as numbers (they
# arrive as factors, or as character strings on R >= 4.0). Calling
# as.integer() directly on a factor returns its internal level codes
# (2, 3, 4, ...) rather than the recorded values, so convert through
# as.character() first. This keeps the documented codings: 0-3 for ca and
# 3/6/7 for thal. The former "?" entries stay NA.
raw.data$ca <- as.integer(as.numeric(as.character(raw.data$ca)))
raw.data$ca <- as.factor(raw.data$ca)

raw.data$thal <- as.integer(as.numeric(as.character(raw.data$thal)))
raw.data$thal <- as.factor(raw.data$thal)

# Collapse the diagnosis to two classes: 0 is healthy, anything else is not.
raw.data$hd <- ifelse(test = raw.data$hd == 0, yes = "Healthy", no = "Unhealthy")
raw.data$hd <- as.factor(raw.data$hd)


str(raw.data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 1 2 2 ...
##  $ cp      : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
##  $ thal    : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
##  $ hd      : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ...

# Random forest

# Fill in the NAs in the predictors with rfImpute(), using hd as the response
# and 6 imputation iterations. rfImpute() grows random forests internally, so
# fix the RNG seed first: without it the imputed values, and every result
# derived from them, change from one run to the next.
set.seed(42)
data.imputed <- rfImpute(hd ~ ., data = raw.data, iter = 6)
## ntree      OOB      1      2
##   300:  16.83% 12.80% 21.58%
## ntree      OOB      1      2
##   300:  18.48% 13.41% 24.46%
## ntree      OOB      1      2
##   300:  18.15% 14.02% 23.02%
## ntree      OOB      1      2
##   300:  18.15% 13.41% 23.74%
## ntree      OOB      1      2
##   300:  17.49% 12.80% 23.02%
## ntree      OOB      1      2
##   300:  17.82% 14.02% 22.30%
# Fit a random forest that predicts hd from all other columns of the imputed
# data, keeping the proximity matrix, then print the fitted model's summary.
model <- randomForest(hd ~ ., data = data.imputed, proximity = TRUE)

print(model)
## 
## Call:
##  randomForest(formula = hd ~ ., data = data.imputed, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 17.16%
## Confusion matrix:
##           Healthy Unhealthy class.error
## Healthy       143        21   0.1280488
## Unhealthy      31       108   0.2230216
# Reshape the model's per-tree error rates into long format for plotting:
# one row per (number of trees, error type) pair, with columns
#   Trees - number of trees grown so far (1 .. nrow(model$err.rate))
#   Type  - which error rate the row holds: "OOB", "Healthy" or "Unhealthy"
#   Error - the error rate itself
# local() keeps the helper variables out of the global workspace.
error.data <- local({
  err.rate <- model$err.rate
  n.trees <- nrow(err.rate)
  series <- c("OOB", "Healthy", "Unhealthy")

  data.frame(
    # seq_len() stays correct (empty) when there are zero rows, unlike 1:n.
    Trees = rep(seq_len(n.trees), times = length(series)),
    Type = rep(series, each = n.trees),
    # Select the columns by name, then flatten column by column so the
    # values line up with the Type labels above.
    Error = as.vector(err.rate[, series, drop = FALSE])
  )
})

# Plotting the error rates

# Draw one line per error type, showing how each error rate changes as
# trees are added to the forest.
g <- ggplot(error.data)

error.lines <- geom_line(mapping = aes(x = Trees, y = Error, colour = Type))
g1 <- g + error.lines

print(g1)