library(ROCR)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
#Load Data
dataset <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data <- read.csv(dataset, header = FALSE, na.strings = "?")
colnames(data) <- c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
"thalach", "exang", "oldpeak", "slope", "ca", "thal", "num")
data <- na.omit(data)
# Quick exploration
str(data)
## 'data.frame': 297 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ num : int 0 2 1 0 0 0 3 0 2 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:6] 88 167 193 267 288 303
## ..- attr(*, "names")= chr [1:6] "88" "167" "193" "267" ...
summary(data)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :1.000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :3.000 Median :130.0
## Mean :54.54 Mean :0.6768 Mean :3.158 Mean :131.7
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :4.000 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.0
## Median :243.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :247.4 Mean :0.1448 Mean :0.9966 Mean :149.6
## 3rd Qu.:276.0 3rd Qu.:0.0000 3rd Qu.:2.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :2.000 Median :0.0000
## Mean :0.3266 Mean :1.056 Mean :1.603 Mean :0.6768
## 3rd Qu.:1.0000 3rd Qu.:1.600 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :3.000 Max. :3.0000
## thal num
## Min. :3.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:0.0000
## Median :3.000 Median :0.0000
## Mean :4.731 Mean :0.9461
## 3rd Qu.:7.000 3rd Qu.:2.0000
## Max. :7.000 Max. :4.0000
# Convert appropriate variables
data$ca <- as.factor(data$ca)
data$thal <- as.factor(data$thal)
# Convert 'num' to binary outcome: 0 = No heart disease, 1 = Presence of heart disease
data$hd <- ifelse(data$num > 0, 1, 0)
str(data)
## 'data.frame': 297 obs. of 15 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
## $ thal : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
## $ num : int 0 2 1 0 0 0 3 0 2 1 ...
## $ hd : num 0 1 1 0 0 0 1 0 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:6] 88 167 193 267 288 303
## ..- attr(*, "names")= chr [1:6] "88" "167" "193" "267" ...
summary(data$hd)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4613 1.0000 1.0000
#2: Data Split
set.seed(999)
train_index <- sample(nrow(data), nrow(data) * 0.7)
data_train <- data[train_index, ]
data_test <- data[-train_index, ]
#Plot
boxplot(data_train$age,
main = "Age",
ylab = "Value")

boxplot(data_train$trestbps,
main = "Resting blood pressure (mmHg)",
ylab = "Value")

boxplot(data_train$chol,
main = "Serum cholestoral (mg/dl)",
ylab = "Value")

boxplot(data_train$thalach,
main = "Maximum heart rate achieved",
ylab = "Value")

#3. Logistic Regression (GLM) + Stepwise Selection
glm_full <- glm(hd ~ . - num, family = binomial, data = data_train)
summary(glm_full)
##
## Call:
## glm(formula = hd ~ . - num, family = binomial, data = data_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.569135 3.608611 -2.098 0.03595 *
## age -0.014443 0.028326 -0.510 0.61012
## sex 1.591373 0.609023 2.613 0.00898 **
## cp 0.664545 0.230995 2.877 0.00402 **
## trestbps 0.027631 0.013368 2.067 0.03874 *
## chol 0.006070 0.004437 1.368 0.17131
## fbs -0.909958 0.633717 -1.436 0.15103
## restecg 0.130308 0.225336 0.578 0.56307
## thalach -0.022991 0.012262 -1.875 0.06079 .
## exang 0.585059 0.485539 1.205 0.22822
## oldpeak 0.084474 0.242893 0.348 0.72800
## slope 0.947381 0.453934 2.087 0.03688 *
## ca1 1.821914 0.575186 3.168 0.00154 **
## ca2 3.443161 0.843904 4.080 4.5e-05 ***
## ca3 1.230336 0.922225 1.334 0.18217
## thal6 0.167115 0.898188 0.186 0.85240
## thal7 1.412673 0.478007 2.955 0.00312 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 286.57 on 206 degrees of freedom
## Residual deviance: 145.25 on 190 degrees of freedom
## AIC: 179.25
##
## Number of Fisher Scoring iterations: 6
# Stepwise selection for best model
glm_step <- step(glm_full)
## Start: AIC=179.25
## hd ~ (age + sex + cp + trestbps + chol + fbs + restecg + thalach +
## exang + oldpeak + slope + ca + thal + num) - num
##
## Df Deviance AIC
## - oldpeak 1 145.37 177.37
## - age 1 145.51 177.51
## - restecg 1 145.58 177.58
## - exang 1 146.68 178.68
## - chol 1 147.13 179.13
## <none> 145.25 179.25
## - fbs 1 147.41 179.41
## - thalach 1 148.95 180.95
## - slope 1 149.55 181.55
## - trestbps 1 149.72 181.72
## - sex 1 152.58 184.58
## - thal 2 154.84 184.84
## - cp 1 154.29 186.29
## - ca 3 170.70 198.70
##
## Step: AIC=177.37
## hd ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach +
## exang + slope + ca + thal
##
## Df Deviance AIC
## - age 1 145.68 175.68
## - restecg 1 145.70 175.70
## - exang 1 146.81 176.81
## - chol 1 147.37 177.37
## <none> 145.37 177.37
## - fbs 1 147.70 177.70
## - thalach 1 149.23 179.23
## - trestbps 1 150.31 180.31
## - slope 1 152.67 182.67
## - thal 2 154.93 182.93
## - sex 1 153.38 183.38
## - cp 1 154.55 184.55
## - ca 3 172.36 198.36
##
## Step: AIC=175.68
## hd ~ sex + cp + trestbps + chol + fbs + restecg + thalach + exang +
## slope + ca + thal
##
## Df Deviance AIC
## - restecg 1 145.98 173.98
## - exang 1 147.15 175.15
## - chol 1 147.40 175.40
## <none> 145.68 175.68
## - fbs 1 148.12 176.12
## - thalach 1 149.24 177.24
## - trestbps 1 150.32 178.32
## - slope 1 153.00 181.00
## - thal 2 155.26 181.26
## - sex 1 153.81 181.81
## - cp 1 155.02 183.02
## - ca 3 173.29 197.29
##
## Step: AIC=173.98
## hd ~ sex + cp + trestbps + chol + fbs + thalach + exang + slope +
## ca + thal
##
## Df Deviance AIC
## - exang 1 147.50 173.50
## <none> 145.98 173.98
## - chol 1 148.08 174.08
## - fbs 1 148.41 174.41
## - thalach 1 149.60 175.60
## - trestbps 1 151.44 177.44
## - thal 2 155.40 179.40
## - slope 1 153.51 179.51
## - sex 1 154.31 180.31
## - cp 1 155.42 181.42
## - ca 3 173.82 195.82
##
## Step: AIC=173.5
## hd ~ sex + cp + trestbps + chol + fbs + thalach + slope + ca +
## thal
##
## Df Deviance AIC
## - chol 1 149.40 173.40
## <none> 147.50 173.50
## - fbs 1 149.85 173.85
## - thalach 1 152.33 176.33
## - trestbps 1 153.01 177.01
## - slope 1 155.50 179.50
## - sex 1 155.58 179.58
## - thal 2 157.89 179.89
## - cp 1 159.75 183.75
## - ca 3 175.29 195.29
##
## Step: AIC=173.4
## hd ~ sex + cp + trestbps + fbs + thalach + slope + ca + thal
##
## Df Deviance AIC
## <none> 149.40 173.40
## - fbs 1 151.74 173.74
## - thalach 1 153.98 175.98
## - trestbps 1 154.96 176.96
## - sex 1 155.91 177.91
## - slope 1 157.41 179.41
## - thal 2 159.74 179.74
## - cp 1 161.45 183.45
## - ca 3 177.58 195.58
summary(glm_step)
##
## Call:
## glm(formula = hd ~ sex + cp + trestbps + fbs + thalach + slope +
## ca + thal, family = binomial, data = data_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.62219 3.03065 -2.185 0.02888 *
## sex 1.37193 0.55703 2.463 0.01378 *
## cp 0.74486 0.22618 3.293 0.00099 ***
## trestbps 0.02747 0.01198 2.293 0.02186 *
## fbs -0.92664 0.61899 -1.497 0.13439
## thalach -0.02324 0.01124 -2.069 0.03858 *
## slope 1.04821 0.37409 2.802 0.00508 **
## ca1 1.72841 0.54314 3.182 0.00146 **
## ca2 3.37986 0.80504 4.198 2.69e-05 ***
## ca3 1.12695 0.90086 1.251 0.21095
## thal6 0.24116 0.85348 0.283 0.77752
## thal7 1.46410 0.47433 3.087 0.00202 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 286.57 on 206 degrees of freedom
## Residual deviance: 149.40 on 195 degrees of freedom
## AIC: 173.4
##
## Number of Fisher Scoring iterations: 6
best_glm <- glm_step
# Model fit criteria
AIC(best_glm)
## [1] 173.4021
BIC(best_glm)
## [1] 213.3947
best_glm$deviance
## [1] 149.4021
#4. GLM Evaluation: Train & Test
##Training set
pred_glm_train<- predict(best_glm, type="response")
pred_train <- prediction(pred_glm_train, data_train$hd)
perf_train <- performance(pred_train, "tpr", "fpr")
auc_glm_train <- unlist(slot(performance(pred_train, "auc"), "y.values"))
auc_glm_train
## [1] 0.9157314
# Confusion matrix (training)
pred_glm_train_class <- (pred_glm_train > 0.5)*1
table(data_train$hd, pred_glm_train_class, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 97 11
## 1 18 81
MR_glm_train <- mean(pred_glm_train_class != data_train$hd) * 100
MR_glm_train
## [1] 14.00966
#Testing set
pred_glm_test <- predict(best_glm,newdata = data_test, type="response")
pred_test <- prediction(pred_glm_test, data_test$hd)
perf_test <- performance(pred_test, "tpr", "fpr")
auc_glm_test <- unlist(slot(performance(pred_test, "auc"), "y.values"))
auc_glm_test
## [1] 0.9291498
# Confusion matrix (test)
pred_glm_test_class <- (pred_glm_test > 0.5)*1
table(data_test$hd, pred_glm_test_class, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 46 6
## 1 9 29
MR_glm_test <- mean(pred_glm_test_class != data_test$hd) * 100
MR_glm_test
## [1] 16.66667
#5. Decision Tree Model with Asymmetric cost
tree_rpart <- rpart(formula = hd ~ . -num ,
data = data_train,
method = "class",
parms = list(loss=matrix(c(0,1,5,0), nrow = 2)))
prp(tree_rpart, extra = 1, main = "Asymmetric Cost Decision Tree")

# Predict on training set
tree_train.pred <- predict(tree_rpart, data_train, type="class")
table(data_train$hd, tree_train.pred, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 107 1
## 1 50 49
MR_tree_train <- mean(tree_train.pred != data_train$hd) * 100
MR_tree_train
## [1] 24.63768
#AUC on training set
tree_train_prob_rpart = predict(tree_rpart, data_train, type="prob")
in.pred = prediction(tree_train_prob_rpart[,2], data_train$hd)
in.perf = performance(in.pred, "tpr", "fpr")
auc_tree_train <- slot(performance(in.pred, "auc"), "y.values")[[1]]
auc_tree_train
## [1] 0.836373
# Predict on testing set
tree_test.pred <-predict(tree_rpart, data_test, type="class")
table(data_test$hd, tree_test.pred, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 49 3
## 1 19 19
MR_tree_test <- mean(tree_test.pred != data_test$hd) * 100
MR_tree_test
## [1] 24.44444
#AUC testing
tree_test_prob_rpart = predict(tree_rpart, data_test, type="prob")
out.pred = prediction(tree_test_prob_rpart[,2], data_test$hd)
out.perf = performance(out.pred, "tpr", "fpr")
auc_tree_test <- slot(performance(out.pred, "auc"), "y.values")[[1]]
auc_tree_test
## [1] 0.7644231
#6. Random Forest Model
rf_model <- randomForest(as.factor(hd) ~ . - num,
data = data_train,
ntree = 500,
importance = TRUE)
rf_model
##
## Call:
## randomForest(formula = as.factor(hd) ~ . - num, data = data_train, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 22.22%
## Confusion matrix:
## 0 1 class.error
## 0 85 23 0.2129630
## 1 23 76 0.2323232
importance(rf_model)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## age 9.3101980 4.8773395 10.4371179 10.9105044
## sex 5.3909008 4.1253916 7.1009187 2.6046158
## cp 11.6194203 13.5854222 17.5240765 12.0072101
## trestbps 4.7421159 -0.6345170 3.0793364 7.6730721
## chol -0.1620141 -0.8939027 -0.7180373 8.6158551
## fbs -0.7039048 -3.7918235 -3.2135240 0.9637521
## restecg -1.8937251 0.7020637 -0.5096327 1.7465248
## thalach 6.0739366 4.9705288 8.3384264 12.1622559
## exang 1.5529589 1.7575048 2.1749294 3.3946755
## oldpeak 2.8281172 7.5051353 7.6249840 8.7421008
## slope 1.0214109 11.5709292 9.9548324 5.5935210
## ca 13.4570932 12.6705281 17.4914990 10.9835439
## thal 18.0677003 16.3767455 22.8287235 16.4018149
# Predict on train set
pred_rf_train_prob <- predict(rf_model, data_train, type = "prob")[, 2]
pred_rf_train_roc <- prediction(pred_rf_train_prob, data_train$hd)
perf_rf_train <- performance(pred_rf_train_roc, "tpr", "fpr")
auc_rf_train <- performance(pred_rf_train_roc, "auc")@y.values[[1]]
auc_rf_train
## [1] 1
# Predict on test set
pred_rf_test_prob <- predict(rf_model, data_test, type = "prob")[, 2]
pred_rf_test_roc <- prediction(pred_rf_test_prob, data_test$hd)
perf_rf_test <- performance(pred_rf_test_roc, "tpr", "fpr")
auc_rf_test <- performance(pred_rf_test_roc, "auc")@y.values[[1]]
auc_rf_test
## [1] 0.9228239
# Confusion matrix (train) for RF
pred_rf_train_class <- ifelse(pred_rf_train_prob > 0.5, 1, 0)
table(Truth = data_train$hd, Predicted = pred_rf_train_class)
## Predicted
## Truth 0 1
## 0 108 0
## 1 0 99
mr_rf_train <- mean(pred_rf_train_class != data_train$hd) *100
mr_rf_train
## [1] 0
# Confusion matrix (test) for RF
pred_rf_test_class <- ifelse(pred_rf_test_prob > 0.5, 1, 0)
table(Truth = data_test$hd, Predicted = pred_rf_test_class)
## Predicted
## Truth 0 1
## 0 49 3
## 1 7 31
mr_rf_test <- mean(pred_rf_test_class != data_test$hd) *100
mr_rf_test
## [1] 11.11111
#7. Plot ROC Curves for All Models (Training Set)
plot(perf_train,
col = "red", lwd = 2,
main = "ROC Curves (Training Data): GLM vs Tree vs Random Forest")
plot(in.perf,
col = "blue", lwd = 2,
add = TRUE)
plot(perf_rf_train,
col = "green", lwd = 2,
add = TRUE)
legend("bottomright",
legend = c(
paste0("Logistic Regression (AUC = ", round(auc_glm_train, 3), ")"),
paste0("Decision Tree (AUC = ", round(auc_tree_train, 3), ")"),
paste0("Random Forest (AUC = ", round(auc_rf_train, 3), ")")
),
col = c("red", "blue", "green"),
lwd = 2)

#8. Plot ROC Curves for All Models (Testing Set)
plot(perf_test,
col = "red", lwd = 2,
main = "ROC Curves (Testing Data): GLM vs Tree vs Random Forest")
plot(out.perf,
col = "blue", lwd = 2,
add = TRUE)
plot(perf_rf_test,
col = "green", lwd = 2,
add = TRUE)
legend("bottomright",
legend = c(
paste0("Logistic Regression (AUC = ", round(auc_glm_test, 3), ")"),
paste0("Decision Tree (AUC = ", round(auc_tree_test, 3), ")"),
paste0("Random Forest (AUC = ", round(auc_rf_test, 3), ")")
),
col = c("red", "blue", "green"),
lwd = 2)

#9) Bar Chart
# Vector of AUC values from your results (testing)
auc_values_test <- c(
GLM = auc_glm_test,
DecisionTree = auc_tree_test,
RandomForest = auc_rf_test)
# Create bar plot
bar_positions_test <- barplot(
auc_values_test,
ylim = c(0, 1),
main = "Model AUC Comparison (Test Set)",
ylab = "AUC",
xlab = "Model",
col = c("skyblue", "pink", "darkseagreen")
)
# Add numeric labels on top of bars (rounded to 3 decimals)
text(x = bar_positions_test,
y = auc_values_test,
labels = round(auc_values_test, 3),
pos = 3, # above the bar
cex = 0.8)

# Vector of AUC values from your results (training)
auc_values_train <- c(
GLM = auc_glm_train,
DecisionTree = auc_tree_train,
RandomForest = auc_rf_train)
# Create bar plot
bar_positions_train <- barplot(
auc_values_train,
ylim = c(0, 1),
main = "Model AUC Comparison (Training Set)",
ylab = "AUC",
xlab = "Model",
col = c("skyblue", "pink", "darkseagreen")
)
# Add numeric labels on top of bars (rounded to 3 decimals)
text(x = bar_positions_train,
y = auc_values_train,
labels = round(auc_values_train, 3),
pos = 3, # above the bar
cex = 0.8)

#10) Bar Chart of Misclassification Rates
# Vector of misclassification rates from your results (testing)
mr_values_test <- c(
GLM = MR_glm_test,
DecisionTree = MR_tree_test,
RandomForest = mr_rf_test
)
# Create bar plot
bar_positions_testmr <- barplot(
mr_values_test,
ylim = c(0, max(mr_values_test) + 5),
main = "Model Misclassification Rate Comparison (Test Set)",
ylab = "Misclassification Rate (%)",
xlab = "Model",
col = c("skyblue", "pink", "darkseagreen")
)
# Add numeric labels on top of bars (rounded to 2 decimals)
text(
x = bar_positions_testmr,
y = mr_values_test,
labels = round(mr_values_test, 2),
pos = 3,
cex = 0.8)

# Vector of misclassification rates from your results (training)
mr_values_train <- c(
GLM = MR_glm_train,
DecisionTree = MR_tree_train,
RandomForest = mr_rf_train
)
# Create bar plot
bar_positions_trainmr <- barplot(
mr_values_train,
ylim = c(0, max(mr_values_train) + 5),
main = "Model Misclassification Rate Comparison (Train Set)",
ylab = "Misclassification Rate (%)",
xlab = "Model",
col = c("skyblue", "pink", "darkseagreen")
)
# Add numeric labels on top of bars (rounded to 2 decimals)
text(
x = bar_positions_trainmr,
y = mr_values_train,
labels = round(mr_values_train, 2),
pos = 3,
cex = 0.8)

#11) Additional
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
control <- trainControl(method = "cv", number = 10)
rf_cv <- train(as.factor(hd) ~ . - num, data = data, method = "rf",
trControl = control, ntree = 500)
rf_cv
## Random Forest
##
## 297 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 267, 267, 267, 267, 267, 268, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8213793 0.6381593
## 9 0.7772414 0.5502964
## 16 0.7740230 0.5431606
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#Most importance items
varImp(rf_cv)
## rf variable importance
##
## Overall
## thalach 100.000
## cp 93.190
## oldpeak 80.776
## thal7 78.625
## age 66.932
## chol 56.579
## trestbps 52.418
## exang 40.202
## slope 38.378
## sex 25.431
## ca1 21.385
## ca2 18.781
## restecg 10.297
## ca3 7.119
## fbs 2.137
## thal6 0.000
#Confuse maxtrix
pred_cv <- predict(rf_cv, data)
confusionMatrix(pred_cv, as.factor(data$hd))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 160 3
## 1 0 134
##
## Accuracy : 0.9899
## 95% CI : (0.9708, 0.9979)
## No Information Rate : 0.5387
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9796
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 1.0000
## Specificity : 0.9781
## Pos Pred Value : 0.9816
## Neg Pred Value : 1.0000
## Prevalence : 0.5387
## Detection Rate : 0.5387
## Detection Prevalence : 0.5488
## Balanced Accuracy : 0.9891
##
## 'Positive' Class : 0
##
#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rf_cv_prob <- predict(rf_cv, data, type = "prob")[,2]
roc_cv <- roc(data$hd, rf_cv_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_cross <- auc(roc_cv)
# Plot ROC curve - Cross Validated
plot(roc_cv, col="darkgreen", lwd=3, main="Cross-Validated ROC Curve")
legend("bottomright",
legend = paste0("AUC = ", round(auc_cross, 3)),
col = "darkgreen",
lwd = 3)
