suppressMessages(library(ggplot2))
suppressMessages(library(cowplot))
## Warning: package 'cowplot' was built under R version 3.6.3
suppressMessages(library(randomForest))
## Warning: package 'randomForest' was built under R version 3.6.3
# Loading data ----
# Cleveland heart-disease data from the UCI repository. The file has no header
# row, so the column names are assigned explicitly below.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
# Missing values are written as "?" in the file. Declaring that here lets
# read.csv() parse `ca` and `thal` as numbers with NA, instead of as
# factor/character columns that contain a literal "?".
raw.data <- read.csv(url, header = FALSE, na.strings = "?")
colnames(raw.data) <- c(
  "age",
  "sex",      # 0 = female, 1 = male
  "cp",       # chest pain type
              #   1 = typical angina
              #   2 = atypical angina
              #   3 = non-anginal pain
              #   4 = asymptomatic
  "trestbps", # resting blood pressure (in mm Hg)
  "chol",     # serum cholesterol in mg/dl
  "fbs",      # fasting blood sugar > 120 mg/dl, 1 = TRUE, 0 = FALSE
  "restecg",  # resting electrocardiographic results
              #   0 = normal
              #   1 = having ST-T wave abnormality
              #   2 = showing probable or definite left ventricular hypertrophy
  "thalach",  # maximum heart rate achieved
  "exang",    # exercise induced angina, 1 = yes, 0 = no
  "oldpeak",  # ST depression induced by exercise relative to rest
  "slope",    # the slope of the peak exercise ST segment
              #   1 = upsloping
              #   2 = flat
              #   3 = downsloping
  "ca",       # number of major vessels (0-3) colored by fluoroscopy
  "thal",     # short for thallium heart scan
              #   3 = normal (no cold spots)
              #   6 = fixed defect (cold spots during rest and exercise)
              #   7 = reversible defect (cold spots only appear during exercise)
  "hd"        # diagnosis of heart disease (the attribute to predict)
              #   0 = less than 50% diameter narrowing (no disease)
              #   any non-zero value = greater than 50% diameter narrowing
)
str(raw.data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : Factor w/ 5 levels "?","0.0","1.0",..: 2 5 4 2 2 2 4 2 3 2 ...
## $ thal : Factor w/ 4 levels "?","3.0","6.0",..: 3 2 4 2 2 2 2 2 4 4 ...
## $ hd : int 0 2 1 0 0 0 3 0 2 1 ...
# Cleaning the data ----
# Recode the raw columns into the types the model should see.

# The source file marks missing values with "?"; turn any that were read in as
# data into real NAs.
raw.data[raw.data == "?"] <- NA

raw.data$sex <- factor(raw.data$sex, labels = c("F", "M"))

# Categorical codes stored as numbers: the numeric value is the label we want.
raw.data$cp <- as.factor(raw.data$cp)
raw.data$fbs <- as.factor(raw.data$fbs)
raw.data$restecg <- as.factor(raw.data$restecg)
raw.data$exang <- as.factor(raw.data$exang)
raw.data$slope <- as.factor(raw.data$slope)

# `ca` and `thal` may arrive as factor, character or numeric depending on how
# the file was read. as.integer() on a factor returns the internal level codes
# (1, 2, 3, ...) rather than the recorded values, so go through as.character()
# first to keep the real values (0-3 for ca; 3, 6, 7 for thal) as the labels.
raw.data$ca <- as.factor(as.integer(as.numeric(as.character(raw.data$ca))))
raw.data$thal <- as.factor(as.integer(as.numeric(as.character(raw.data$thal))))

# Collapse the diagnosis to a binary outcome: 0 is healthy, anything else is not.
raw.data$hd <- factor(
  ifelse(raw.data$hd == 0, "Healthy", "Unhealthy"),
  levels = c("Healthy", "Unhealthy")
)

str(raw.data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 1 2 2 ...
## $ cp : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : Factor w/ 4 levels "2","3","4","5": 1 4 3 1 1 1 3 1 2 1 ...
## $ thal : Factor w/ 3 levels "2","3","4": 2 1 3 1 1 1 1 1 3 3 ...
## $ hd : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ...
# Random forest ----
# rfImpute() fills in missing predictor values using random-forest proximities.
# The procedure is stochastic, so fix the seed to make the imputed data (and
# everything fitted on it afterwards) reproducible from run to run.
set.seed(42)
# iter = 6 runs six imputation rounds; the OOB error of each round is printed.
data.imputed <- rfImpute(hd ~ ., data = raw.data, iter = 6)
## ntree OOB 1 2
## 300: 16.83% 12.80% 21.58%
## ntree OOB 1 2
## 300: 18.48% 13.41% 24.46%
## ntree OOB 1 2
## 300: 18.15% 14.02% 23.02%
## ntree OOB 1 2
## 300: 18.15% 13.41% 23.74%
## ntree OOB 1 2
## 300: 17.49% 12.80% 23.02%
## ntree OOB 1 2
## 300: 17.82% 14.02% 22.30%
# Fit the classification forest on the imputed data. proximity = TRUE asks
# randomForest() to also compute the proximity measure among the rows.
model <- randomForest(
  hd ~ .,
  data = data.imputed,
  proximity = TRUE
)
# Show the fit summary (OOB error estimate and confusion matrix).
model
##
## Call:
## randomForest(formula = hd ~ ., data = data.imputed, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 17.16%
## Confusion matrix:
## Healthy Unhealthy class.error
## Healthy 143 21 0.1280488
## Unhealthy 31 108 0.2230216
# Reshape the per-tree error rates into long format for plotting: one row per
# (number of trees, error type) pair. model$err.rate is a matrix with one row
# per tree and one column per error type ("OOB" plus one column per class).
# local() keeps the intermediate names out of the global workspace.
error.data <- local({
  err_rate <- model$err.rate
  n_trees <- nrow(err_rate)
  data.frame(
    # seq_len() is safe even for an empty matrix, unlike 1:n.
    Trees = rep(seq_len(n_trees), times = ncol(err_rate)),
    # Take the labels from the matrix itself so they always match the columns.
    Type = rep(colnames(err_rate), each = n_trees),
    # as.vector() unrolls the matrix column by column, matching Trees and Type.
    Error = as.vector(err_rate)
  )
})
# Plotting the error rates ----
# Error rate against the number of trees, one coloured line per error type.
g <- ggplot(data = error.data)
g1 <- g +
  geom_line(mapping = aes(x = Trees, y = Error, colour = Type))
# Display the plot.
g1
