# Final.r

library(ROCR)
library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

# Load the processed Cleveland heart-disease file from the UCI archive.
# The file has no header row and marks missing values with "?".
dataset <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data <- read.csv(
  dataset,
  header = FALSE,
  na.strings = "?",
  col.names = c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num")
)
# Keep complete rows only.
data <- na.omit(data)
# Quick look at column types and leading values.
str(data)

## 'data.frame':    297 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num     : int  0 2 1 0 0 0 3 0 2 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:6] 88 167 193 267 288 303
##   ..- attr(*, "names")= chr [1:6] "88" "167" "193" "267" ...

# Per-column summary statistics of the cleaned data frame.
summary(data)

##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :1.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :3.000   Median :130.0  
##  Mean   :54.54   Mean   :0.6768   Mean   :3.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.0  
##  Median :243.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :247.4   Mean   :0.1448   Mean   :0.9966   Mean   :149.6  
##  3rd Qu.:276.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :2.000   Median :0.0000  
##  Mean   :0.3266   Mean   :1.056   Mean   :1.603   Mean   :0.6768  
##  3rd Qu.:1.0000   3rd Qu.:1.600   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :3.000   Max.   :3.0000  
##       thal            num        
##  Min.   :3.000   Min.   :0.0000  
##  1st Qu.:3.000   1st Qu.:0.0000  
##  Median :3.000   Median :0.0000  
##  Mean   :4.731   Mean   :0.9461  
##  3rd Qu.:7.000   3rd Qu.:2.0000  
##  Max.   :7.000   Max.   :4.0000

# Treat 'ca' and 'thal' as categorical predictors.
# NOTE(review): cp, restecg and slope also take only a few integer values in
# the summary and look like category codes, yet they stay numeric -- confirm
# that this is intended.
data[c("ca", "thal")] <- lapply(data[c("ca", "thal")], as.factor)

# Binary outcome derived from 'num': 0 = no heart disease, 1 = heart disease present
data$hd <- as.numeric(data$num > 0)
str(data)

## 'data.frame':    297 obs. of  15 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
##  $ thal    : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
##  $ num     : int  0 2 1 0 0 0 3 0 2 1 ...
##  $ hd      : num  0 1 1 0 0 0 1 0 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:6] 88 167 193 267 288 303
##   ..- attr(*, "names")= chr [1:6] "88" "167" "193" "267" ...

# Class balance check: hd is coded 0/1, so its mean is the share of rows with hd == 1.
summary(data$hd)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4613  1.0000  1.0000

# 2: Data split -- 70% of the rows for training, the remainder for testing
set.seed(999)  # fixed seed so the partition can be reproduced
train_index <- sample(nrow(data), size = floor(0.7 * nrow(data)))
data_train <- data[train_index, ]
data_test <- data[-train_index, ]

# Boxplots of four numeric predictors in the training set.
# Names are data_train columns; values are the plot titles.
boxplot_titles <- c(
  age      = "Age",
  trestbps = "Resting blood pressure (mmHg)",
  chol     = "Serum cholestoral (mg/dl)",
  thalach  = "Maximum heart rate achieved"
)

for (var_name in names(boxplot_titles)) {
  boxplot(data_train[[var_name]],
          main = boxplot_titles[[var_name]],
          ylab = "Value")
}

#3. Logistic Regression (GLM) + Stepwise Selection
# Full model: hd regressed on every other column of data_train. `- num` removes
# the original multi-level outcome that hd was derived from, so it cannot leak
# into the predictors.
glm_full <- glm(hd ~ . - num, family = binomial, data = data_train)
summary(glm_full)

## 
## Call:
## glm(formula = hd ~ . - num, family = binomial, data = data_train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.569135   3.608611  -2.098  0.03595 *  
## age         -0.014443   0.028326  -0.510  0.61012    
## sex          1.591373   0.609023   2.613  0.00898 ** 
## cp           0.664545   0.230995   2.877  0.00402 ** 
## trestbps     0.027631   0.013368   2.067  0.03874 *  
## chol         0.006070   0.004437   1.368  0.17131    
## fbs         -0.909958   0.633717  -1.436  0.15103    
## restecg      0.130308   0.225336   0.578  0.56307    
## thalach     -0.022991   0.012262  -1.875  0.06079 .  
## exang        0.585059   0.485539   1.205  0.22822    
## oldpeak      0.084474   0.242893   0.348  0.72800    
## slope        0.947381   0.453934   2.087  0.03688 *  
## ca1          1.821914   0.575186   3.168  0.00154 ** 
## ca2          3.443161   0.843904   4.080  4.5e-05 ***
## ca3          1.230336   0.922225   1.334  0.18217    
## thal6        0.167115   0.898188   0.186  0.85240    
## thal7        1.412673   0.478007   2.955  0.00312 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 286.57  on 206  degrees of freedom
## Residual deviance: 145.25  on 190  degrees of freedom
## AIC: 179.25
## 
## Number of Fisher Scoring iterations: 6

# Stepwise selection for best model
# step() is called with its defaults (no scope given); it prints the candidate
# table for every step, and the recorded trace shows terms being dropped one at
# a time until no removal lowers the AIC.
glm_step <- step(glm_full)

## Start:  AIC=179.25
## hd ~ (age + sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal + num) - num
## 
##            Df Deviance    AIC
## - oldpeak   1   145.37 177.37
## - age       1   145.51 177.51
## - restecg   1   145.58 177.58
## - exang     1   146.68 178.68
## - chol      1   147.13 179.13
## <none>          145.25 179.25
## - fbs       1   147.41 179.41
## - thalach   1   148.95 180.95
## - slope     1   149.55 181.55
## - trestbps  1   149.72 181.72
## - sex       1   152.58 184.58
## - thal      2   154.84 184.84
## - cp        1   154.29 186.29
## - ca        3   170.70 198.70
## 
## Step:  AIC=177.37
## hd ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   145.68 175.68
## - restecg   1   145.70 175.70
## - exang     1   146.81 176.81
## - chol      1   147.37 177.37
## <none>          145.37 177.37
## - fbs       1   147.70 177.70
## - thalach   1   149.23 179.23
## - trestbps  1   150.31 180.31
## - slope     1   152.67 182.67
## - thal      2   154.93 182.93
## - sex       1   153.38 183.38
## - cp        1   154.55 184.55
## - ca        3   172.36 198.36
## 
## Step:  AIC=175.68
## hd ~ sex + cp + trestbps + chol + fbs + restecg + thalach + exang + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## - restecg   1   145.98 173.98
## - exang     1   147.15 175.15
## - chol      1   147.40 175.40
## <none>          145.68 175.68
## - fbs       1   148.12 176.12
## - thalach   1   149.24 177.24
## - trestbps  1   150.32 178.32
## - slope     1   153.00 181.00
## - thal      2   155.26 181.26
## - sex       1   153.81 181.81
## - cp        1   155.02 183.02
## - ca        3   173.29 197.29
## 
## Step:  AIC=173.98
## hd ~ sex + cp + trestbps + chol + fbs + thalach + exang + slope + 
##     ca + thal
## 
##            Df Deviance    AIC
## - exang     1   147.50 173.50
## <none>          145.98 173.98
## - chol      1   148.08 174.08
## - fbs       1   148.41 174.41
## - thalach   1   149.60 175.60
## - trestbps  1   151.44 177.44
## - thal      2   155.40 179.40
## - slope     1   153.51 179.51
## - sex       1   154.31 180.31
## - cp        1   155.42 181.42
## - ca        3   173.82 195.82
## 
## Step:  AIC=173.5
## hd ~ sex + cp + trestbps + chol + fbs + thalach + slope + ca + 
##     thal
## 
##            Df Deviance    AIC
## - chol      1   149.40 173.40
## <none>          147.50 173.50
## - fbs       1   149.85 173.85
## - thalach   1   152.33 176.33
## - trestbps  1   153.01 177.01
## - slope     1   155.50 179.50
## - sex       1   155.58 179.58
## - thal      2   157.89 179.89
## - cp        1   159.75 183.75
## - ca        3   175.29 195.29
## 
## Step:  AIC=173.4
## hd ~ sex + cp + trestbps + fbs + thalach + slope + ca + thal
## 
##            Df Deviance    AIC
## <none>          149.40 173.40
## - fbs       1   151.74 173.74
## - thalach   1   153.98 175.98
## - trestbps  1   154.96 176.96
## - sex       1   155.91 177.91
## - slope     1   157.41 179.41
## - thal      2   159.74 179.74
## - cp        1   161.45 183.45
## - ca        3   177.58 195.58

# Coefficient table and fit statistics of the model selected by step().
summary(glm_step)

## 
## Call:
## glm(formula = hd ~ sex + cp + trestbps + fbs + thalach + slope + 
##     ca + thal, family = binomial, data = data_train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.62219    3.03065  -2.185  0.02888 *  
## sex          1.37193    0.55703   2.463  0.01378 *  
## cp           0.74486    0.22618   3.293  0.00099 ***
## trestbps     0.02747    0.01198   2.293  0.02186 *  
## fbs         -0.92664    0.61899  -1.497  0.13439    
## thalach     -0.02324    0.01124  -2.069  0.03858 *  
## slope        1.04821    0.37409   2.802  0.00508 ** 
## ca1          1.72841    0.54314   3.182  0.00146 ** 
## ca2          3.37986    0.80504   4.198 2.69e-05 ***
## ca3          1.12695    0.90086   1.251  0.21095    
## thal6        0.24116    0.85348   0.283  0.77752    
## thal7        1.46410    0.47433   3.087  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 286.57  on 206  degrees of freedom
## Residual deviance: 149.40  on 195  degrees of freedom
## AIC: 173.4
## 
## Number of Fisher Scoring iterations: 6

# The stepwise model is the logistic regression used from here on.
best_glm <- glm_step

# Model fit criteria: AIC, BIC and residual deviance
AIC(best_glm)

## [1] 173.4021

BIC(best_glm)

## [1] 213.3947

deviance(best_glm)

## [1] 149.4021

#4. GLM Evaluation: Train & Test
## Training set: fitted probabilities, ROC curve and AUC
pred_glm_train <- predict(best_glm, type = "response")
pred_train <- prediction(pred_glm_train, data_train$hd)
perf_train <- performance(pred_train, measure = "tpr", x.measure = "fpr")
auc_glm_train <- performance(pred_train, measure = "auc")@y.values[[1]]
auc_glm_train

## [1] 0.9157314

# Confusion matrix (training) at a 0.5 probability cut-off
pred_glm_train_class <- ifelse(pred_glm_train > 0.5, 1, 0)
table(True = data_train$hd, Pred = pred_glm_train_class)

##     Pred
## True  0  1
##    0 97 11
##    1 18 81

# Misclassification rate in percent
MR_glm_train <- 100 * mean(pred_glm_train_class != data_train$hd)
MR_glm_train

## [1] 14.00966

# Testing set: predicted probabilities for the held-out rows, ROC curve and AUC
pred_glm_test <- predict(best_glm, newdata = data_test, type = "response")
pred_test <- prediction(pred_glm_test, data_test$hd)
perf_test <- performance(pred_test, measure = "tpr", x.measure = "fpr")
auc_glm_test <- performance(pred_test, measure = "auc")@y.values[[1]]
auc_glm_test

## [1] 0.9291498

# Confusion matrix (test) at a 0.5 probability cut-off
pred_glm_test_class <- ifelse(pred_glm_test > 0.5, 1, 0)
table(True = data_test$hd, Pred = pred_glm_test_class)

##     Pred
## True  0  1
##    0 46  6
##    1  9 29

# Misclassification rate in percent
MR_glm_test <- 100 * mean(pred_glm_test_class != data_test$hd)
MR_glm_test

## [1] 16.66667

#5. Decision Tree Model with Asymmetric cost
# The loss matrix is written out row by row: first row (0, 5), second row (1, 0).
# NOTE(review): if rpart reads rows as the true class and columns as the
# predicted class, this charges 5 for predicting 1 when the truth is 0 and only
# 1 for the reverse. The recorded confusion matrices (very few predicted 1s)
# are consistent with that reading -- confirm this is the intended direction.
tree_rpart <- rpart(
  formula = hd ~ . - num,
  data    = data_train,
  method  = "class",
  parms   = list(loss = rbind(c(0, 5),
                              c(1, 0)))
)
prp(tree_rpart, extra = 1, main = "Asymmetric Cost Decision Tree")

# Predict class labels on the training set and cross-tabulate against the truth
tree_train.pred <- predict(tree_rpart, newdata = data_train, type = "class")
table(Truth = data_train$hd, Predicted = tree_train.pred)

##      Predicted
## Truth   0   1
##     0 107   1
##     1  50  49

# Misclassification rate in percent
MR_tree_train <- 100 * mean(tree_train.pred != data_train$hd)
MR_tree_train

## [1] 24.63768

# AUC on training set, scored with the second column of the class probabilities
tree_train_prob_rpart <- predict(tree_rpart, newdata = data_train, type = "prob")
in.pred <- prediction(tree_train_prob_rpart[, 2], data_train$hd)
in.perf <- performance(in.pred, measure = "tpr", x.measure = "fpr")
auc_tree_train <- unlist(performance(in.pred, measure = "auc")@y.values)
auc_tree_train

## [1] 0.836373

# Predict class labels on the testing set and cross-tabulate against the truth
tree_test.pred <- predict(tree_rpart, newdata = data_test, type = "class")
table(Truth = data_test$hd, Predicted = tree_test.pred)

##      Predicted
## Truth  0  1
##     0 49  3
##     1 19 19

# Misclassification rate in percent
MR_tree_test <- 100 * mean(tree_test.pred != data_test$hd)
MR_tree_test

## [1] 24.44444

# AUC on testing set, scored with the second column of the class probabilities
tree_test_prob_rpart <- predict(tree_rpart, newdata = data_test, type = "prob")
out.pred <- prediction(tree_test_prob_rpart[, 2], data_test$hd)
out.perf <- performance(out.pred, measure = "tpr", x.measure = "fpr")
auc_tree_test <- unlist(performance(out.pred, measure = "auc")@y.values)
auc_tree_test

## [1] 0.7644231

#6. Random Forest Model
# hd is wrapped in as.factor() so that a classification forest is fitted (the
# printout reports "Type of random forest: classification"). `- num` keeps the
# original multi-level outcome out of the predictors; 500 trees are grown and
# predictor importance is assessed as well.
rf_model <- randomForest(as.factor(hd) ~ . - num,
                         data = data_train,
                         ntree = 500,
                         importance = TRUE)

# Print the fitted forest: call, OOB error estimate and OOB confusion matrix.
rf_model

## 
## Call:
##  randomForest(formula = as.factor(hd) ~ . - num, data = data_train,      ntree = 500, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 22.22%
## Confusion matrix:
##    0  1 class.error
## 0 85 23   0.2129630
## 1 23 76   0.2323232

# Per-predictor importance table: per-class columns, MeanDecreaseAccuracy and MeanDecreaseGini.
importance(rf_model)

##                   0          1 MeanDecreaseAccuracy MeanDecreaseGini
## age       9.3101980  4.8773395           10.4371179       10.9105044
## sex       5.3909008  4.1253916            7.1009187        2.6046158
## cp       11.6194203 13.5854222           17.5240765       12.0072101
## trestbps  4.7421159 -0.6345170            3.0793364        7.6730721
## chol     -0.1620141 -0.8939027           -0.7180373        8.6158551
## fbs      -0.7039048 -3.7918235           -3.2135240        0.9637521
## restecg  -1.8937251  0.7020637           -0.5096327        1.7465248
## thalach   6.0739366  4.9705288            8.3384264       12.1622559
## exang     1.5529589  1.7575048            2.1749294        3.3946755
## oldpeak   2.8281172  7.5051353            7.6249840        8.7421008
## slope     1.0214109 11.5709292            9.9548324        5.5935210
## ca       13.4570932 12.6705281           17.4914990       10.9835439
## thal     18.0677003 16.3767455           22.8287235       16.4018149

# Predict on train set
# These rows are the ones the forest was grown on, so this is a resubstitution
# score: compare the AUC of 1 here with the 22.22% OOB error in the printout.
pred_rf_train_prob <- predict(rf_model, newdata = data_train, type = "prob")[, 2]
pred_rf_train_roc <- prediction(pred_rf_train_prob, data_train$hd)
perf_rf_train <- performance(pred_rf_train_roc, measure = "tpr", x.measure = "fpr")
auc_rf_train <- unlist(slot(performance(pred_rf_train_roc, measure = "auc"), "y.values"))
auc_rf_train

## [1] 1

# Predict on test set (held-out rows)
pred_rf_test_prob <- predict(rf_model, newdata = data_test, type = "prob")[, 2]
pred_rf_test_roc <- prediction(pred_rf_test_prob, data_test$hd)
perf_rf_test <- performance(pred_rf_test_roc, measure = "tpr", x.measure = "fpr")
auc_rf_test <- unlist(slot(performance(pred_rf_test_roc, measure = "auc"), "y.values"))
auc_rf_test

## [1] 0.9228239

# Confusion matrix (train) for RF at a 0.5 probability cut-off
pred_rf_train_class <- (pred_rf_train_prob > 0.5) * 1
table(data_train$hd, pred_rf_train_class, dnn = c("Truth", "Predicted"))

##      Predicted
## Truth   0   1
##     0 108   0
##     1   0  99

# Misclassification rate in percent
mr_rf_train <- 100 * mean(pred_rf_train_class != data_train$hd)
mr_rf_train

## [1] 0

# Confusion matrix (test) for RF at a 0.5 probability cut-off
pred_rf_test_class <- (pred_rf_test_prob > 0.5) * 1
table(data_test$hd, pred_rf_test_class, dnn = c("Truth", "Predicted"))

##      Predicted
## Truth  0  1
##     0 49  3
##     1  7 31

# Misclassification rate in percent
mr_rf_test <- 100 * mean(pred_rf_test_class != data_test$hd)
mr_rf_test

## [1] 11.11111

#7. Plot ROC Curves for All Models (Training Set)
# The first call opens the plot; the other two curves are overlaid on it.
plot(perf_train, col = "red", lwd = 2,
     main = "ROC Curves (Training Data): GLM vs Tree vs Random Forest")
plot(in.perf, col = "blue", lwd = 2, add = TRUE)
plot(perf_rf_train, col = "green", lwd = 2, add = TRUE)

# Legend entries carry each model's training AUC rounded to 3 decimals.
legend(
  "bottomright",
  legend = paste0(
    c("Logistic Regression", "Decision Tree", "Random Forest"),
    " (AUC = ",
    round(c(auc_glm_train, auc_tree_train, auc_rf_train), 3),
    ")"
  ),
  col = c("red", "blue", "green"),
  lwd = 2
)

#8. Plot ROC Curves for All Models (Testing Set)
# The first call opens the plot; the other two curves are overlaid on it.
plot(perf_test, col = "red", lwd = 2,
     main = "ROC Curves (Testing Data): GLM vs Tree vs Random Forest")
plot(out.perf, col = "blue", lwd = 2, add = TRUE)
plot(perf_rf_test, col = "green", lwd = 2, add = TRUE)

# Legend entries carry each model's test AUC rounded to 3 decimals.
legend(
  "bottomright",
  legend = paste0(
    c("Logistic Regression", "Decision Tree", "Random Forest"),
    " (AUC = ",
    round(c(auc_glm_test, auc_tree_test, auc_rf_test), 3),
    ")"
  ),
  col = c("red", "blue", "green"),
  lwd = 2
)

#9) Bar Chart

# Draw one bar chart for a named vector of per-model metric values and print
# each value above its bar.
#   values : named numeric vector; the names become the bar labels
#   title  : plot title
#   ylab   : y-axis label
#   ylim   : y-axis limits; must leave room above the tallest bar, because the
#            value labels sit above the bars and are clipped at the plot edge
#   digits : number of decimals shown in the value labels
# Returns the bar midpoints computed by barplot(), invisibly.
plot_metric_bars <- function(values, title, ylab, ylim, digits) {
  bar_mids <- barplot(
    values,
    ylim = ylim,
    main = title,
    ylab = ylab,
    xlab = "Model",
    col  = c("skyblue", "pink", "darkseagreen")
  )
  text(x = bar_mids,
       y = values,
       labels = round(values, digits),
       pos = 3,         # above the bar
       cex = 0.8)
  invisible(bar_mids)
}

# AUC values from the results above (testing)
auc_values_test <- c(
  GLM           = auc_glm_test,
  DecisionTree  = auc_tree_test,
  RandomForest  = auc_rf_test)

# The y-axis runs to 1.1 rather than 1 so that the label of a bar with AUC = 1
# is not cut off at the top of the plot region.
bar_positions_test <- plot_metric_bars(
  auc_values_test,
  title  = "Model AUC Comparison (Test Set)",
  ylab   = "AUC",
  ylim   = c(0, 1.1),
  digits = 3
)

# AUC values from the results above (training)
auc_values_train <- c(
  GLM           = auc_glm_train,
  DecisionTree  = auc_tree_train,
  RandomForest  = auc_rf_train)

bar_positions_train <- plot_metric_bars(
  auc_values_train,
  title  = "Model AUC Comparison (Training Set)",
  ylab   = "AUC",
  ylim   = c(0, 1.1),
  digits = 3
)

#10) Bar Chart of Misclassification Rates

# Misclassification rates (in percent) from the results above (testing)
mr_values_test <- c(
  GLM           = MR_glm_test,
  DecisionTree  = MR_tree_test,
  RandomForest  = mr_rf_test
)

# Five percentage points of headroom above the tallest bar for the labels.
bar_positions_testmr <- plot_metric_bars(
  mr_values_test,
  title  = "Model Misclassification Rate Comparison (Test Set)",
  ylab   = "Misclassification Rate (%)",
  ylim   = c(0, max(mr_values_test) + 5),
  digits = 2
)

# Misclassification rates (in percent) from the results above (training)
mr_values_train <- c(
  GLM           = MR_glm_train,
  DecisionTree  = MR_tree_train,
  RandomForest  = mr_rf_train
)

bar_positions_trainmr <- plot_metric_bars(
  mr_values_train,
  title  = "Model Misclassification Rate Comparison (Train Set)",
  ylab   = "Misclassification Rate (%)",
  ylim   = c(0, max(mr_values_train) + 5),
  digits = 2
)

#11) Additional
library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Loading required package: lattice

# 10-fold cross-validation on the full data set; caret compares several mtry
# values by resampled accuracy and refits the forest with the best one.
control <- trainControl(method = "cv", number = 10)
rf_cv <- train(
  as.factor(hd) ~ . - num,
  data      = data,
  method    = "rf",
  ntree     = 500,
  trControl = control
)
rf_cv

## Random Forest 
## 
## 297 samples
##  14 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 267, 267, 267, 267, 267, 268, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8213793  0.6381593
##    9    0.7772414  0.5502964
##   16    0.7740230  0.5431606
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

# Most important predictors according to the tuned forest
varImp(rf_cv)

## rf variable importance
## 
##          Overall
## thalach  100.000
## cp        93.190
## oldpeak   80.776
## thal7     78.625
## age       66.932
## chol      56.579
## trestbps  52.418
## exang     40.202
## slope     38.378
## sex       25.431
## ca1       21.385
## ca2       18.781
## restecg   10.297
## ca3        7.119
## fbs        2.137
## thal6      0.000

# Confusion matrix for the tuned forest, built from out-of-bag (OOB) predictions.
# predict(rf_cv, data) would score every row with a forest that was grown on
# that same row, which overstates accuracy (98.99% against the 82.1% reported
# by cross-validation). The OOB prediction for a row only uses the trees whose
# bootstrap sample left that row out, so it is an honest estimate.
pred_cv <- rf_cv$finalModel$predicted
# hd == 1 (disease present) is the positive class, so that sensitivity and
# specificity refer to detecting disease.
confusionMatrix(pred_cv, as.factor(data$hd), positive = "1")
# NOTE: the printout recorded after this call was produced by the earlier
# predict(rf_cv, data) version and will differ once the script is re-run.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 160   3
##          1   0 134
##                                           
##                Accuracy : 0.9899          
##                  95% CI : (0.9708, 0.9979)
##     No Information Rate : 0.5387          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9796          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9781          
##          Pos Pred Value : 0.9816          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5387          
##          Detection Rate : 0.5387          
##    Detection Prevalence : 0.5488          
##       Balanced Accuracy : 0.9891          
##                                           
##        'Positive' Class : 0               
##

#ROC
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Out-of-bag vote share for class "1" (second column), one value per row of
# `data`. Scoring `data` with predict() instead would evaluate the forest on
# its own training rows and give an inflated ROC curve.
rf_cv_prob <- rf_cv$finalModel$votes[, 2]
# Controls (0) and cases (1) and the comparison direction are stated
# explicitly, so pROC does not have to guess them.
roc_cv <- roc(response = data$hd, predictor = rf_cv_prob,
              levels = c(0, 1), direction = "<")

auc_cross <- auc(roc_cv)

# Plot ROC curve - out-of-bag predictions of the caret-tuned forest
plot(roc_cv, col = "darkgreen", lwd = 3,
     main = "Out-of-Bag ROC Curve (caret-tuned Random Forest)")
legend("bottomright",
       legend = paste0("AUC = ", round(auc_cross, 3)),
       col = "darkgreen",
       lwd = 3)

# Final.r
#
# phuongnguyen
#
# 2025-12-02