options(scipen=999)
setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(rattle)
library(ggplot2)
library(ISLR2)
library(MASS)
library(caret)
library(splines)
library(pROC)
library(randomForest)

# HEART DISEASE DATA

HeartData  = read.csv("Heart.csv", header=TRUE)

#HeartData

View(HeartData)
nrow(HeartData)
[1] 303
HeartData = na.omit(HeartData)
nrow(HeartData)
[1] 297
par(mfrow=c(2,2))
boxplot(HeartData$Age ~ as.factor(HeartData$AHD))
boxplot(HeartData$MaxHR ~ as.factor(HeartData$AHD))
boxplot(HeartData$Chol ~ as.factor(HeartData$AHD))
boxplot(HeartData$RestBP ~ as.factor(HeartData$AHD))

par(mfrow=c(1,1))
pairs( cbind( HeartData$Chol, HeartData$MaxHR, HeartData$RestBP,HeartData$Age), pch=19, lower.panel=NULL, cex=.5)

# Classification based on the default p.hat = 0.5 cutoff

# 5-fold CV, repeated 10 times (matches the trainControl call below)

HeartData$HD = as.factor(HeartData$AHD)
train_model <- trainControl(method = "repeatedcv", number = 5, repeats=10)
model.cart <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
        + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng, 
  data = HeartData, 
  method = "rpart",
  trControl = train_model)

model.cart
CART 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 238, 237, 237, 238, 238, 237, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.01094891  0.7289492  0.4537329
  0.02189781  0.7404237  0.4774736
  0.48905109  0.6519944  0.2697103

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.
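# caret's default grid tried only the three cp values suggested by rpart's
# complexity table. A sketch of a denser search via tuneGrid; the cp sequence and
# the object names cp_grid / model.cart.grid are illustrative, not from the
# original run.
cp_grid <- expand.grid(cp = seq(0.005, 0.05, by = 0.005))
model.cart.grid <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
        + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  tuneGrid = cp_grid,
  trControl = train_model)
model.cart.grid$bestTune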
model.cart$finalModel
n= 297 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 297 137 No (0.5387205 0.4612795)  
  2) as.factor(Thal)normal>=0.5 164  37 No (0.7743902 0.2256098) *
  3) as.factor(Thal)normal< 0.5 133  33 Yes (0.2481203 0.7518797) *
confusionMatrix(predict(model.cart, HeartData), 
                reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  127  37
       Yes  33 100
                                               
               Accuracy : 0.7643               
                 95% CI : (0.7118, 0.8114)     
    No Information Rate : 0.5387               
    P-Value [Acc > NIR] : 0.0000000000000007203
                                               
                  Kappa : 0.5248               
                                               
 Mcnemar's Test P-Value : 0.7199               
                                               
            Sensitivity : 0.7299               
            Specificity : 0.7937               
         Pos Pred Value : 0.7519               
         Neg Pred Value : 0.7744               
             Prevalence : 0.4613               
         Detection Rate : 0.3367               
   Detection Prevalence : 0.4478               
      Balanced Accuracy : 0.7618               
                                               
       'Positive' Class : Yes                  
                                               

summary(model.cart$finalModel)

fancyRpartPlot(model.cart$finalModel)
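
# The confusion matrix above comes from predict(model.cart, HeartData), which for a
# two-class problem labels a row "Yes" when its predicted probability is the larger
# one, i.e. the p.hat = 0.5 cutoff. A sketch that makes the cutoff explicit so other
# thresholds can be tried; p.hat, cutoff and pred.class are illustrative names, and
# the result should match the default class predictions except at exact ties.
p.hat  <- predict(model.cart, HeartData, type = "prob")[, "Yes"]
cutoff <- 0.5
pred.class <- factor(ifelse(p.hat > cutoff, "Yes", "No"), levels = c("No", "Yes"))
confusionMatrix(pred.class, reference = HeartData$HD, positive = "Yes")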

model.rf <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
  + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData, 
  method = "rf",
  trControl = train_model)
model.rf
Random Forest 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 238, 238, 238, 237, 237, 238, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7645932  0.5262468
   6    0.7337345  0.4640833
  10    0.7256158  0.4474989

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

#HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,

summary(model.rf$finalModel)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted        297   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes            594   matrix     numeric  
oob.times        297   -none-     numeric  
classes            2   -none-     character
importance        10   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y                297   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            10   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
model.rf$finalModel

Call:
 randomForest(x = x, y = y, mtry = param$mtry) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 23.57%
Confusion matrix:
     No Yes class.error
No  126  34   0.2125000
Yes  36 101   0.2627737
plot(model.rf$finalModel)
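# plot() on a randomForest object draws one error-rate curve per column of err.rate
# (OOB plus one curve per class) but adds no legend. A small sketch, assuming
# matplot's default colours and line types:
legend("topright",
       legend = colnames(model.rf$finalModel$err.rate),
       col = 1:3, lty = 1:3, cex = 0.8)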

varImp(model.rf$finalModel)
                            Overall
Age                       17.500869
as.factor(Sex)1            5.507407
as.factor(Thal)normal     15.580738
as.factor(Thal)reversable 13.890848
Chol                      15.524651
MaxHR                     24.629367
RestBP                    14.526597
Fbs                        2.268989
RestECG                    3.961817
ExAng                     10.524847
plot( varImp(model.rf) )

# OOB class probabilities; column 1 is P(No) and column 2 is P(Yes)
yhat = predict(model.rf$finalModel, type="prob")[,1]
plot(HeartData$MaxHR, yhat)

scatter.smooth(HeartData$MaxHR, yhat, span=.3) 
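
# A complementary view of the MaxHR effect is a partial dependence plot. The forest
# inside caret was fit on the dummy-coded design matrix, so pred.data has to carry
# the same column names as the varImp() table above; this is a sketch under that
# assumption, not part of the original analysis.
X <- model.matrix(HD ~ Age + as.factor(Sex) + as.factor(Thal)
                  + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
                  data = HeartData)[, -1]
partialPlot(model.rf$finalModel, pred.data = data.frame(X, check.names = FALSE),
            x.var = "MaxHR", which.class = "Yes")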

confusionMatrix(predict(model.rf, HeartData), 
                reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  156   7
       Yes   4 130
                                             
               Accuracy : 0.963              
                 95% CI : (0.9347, 0.9814)   
    No Information Rate : 0.5387             
    P-Value [Acc > NIR] : <0.0000000000000002
                                             
                  Kappa : 0.9254             
                                             
 Mcnemar's Test P-Value : 0.5465             
                                             
            Sensitivity : 0.9489             
            Specificity : 0.9750             
         Pos Pred Value : 0.9701             
         Neg Pred Value : 0.9571             
             Prevalence : 0.4613             
         Detection Rate : 0.4377             
   Detection Prevalence : 0.4512             
      Balanced Accuracy : 0.9620             
                                             
       'Positive' Class : Yes
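
# The random forest confusion matrix above is computed on the training data, so its
# 96% accuracy is optimistic compared with the ~24% OOB error. The pROC package loaded
# at the top can compare the two models across all cutoffs instead of only p.hat = 0.5;
# a sketch, using OOB votes for the forest (the CART probabilities are still in-sample,
# so that curve remains somewhat optimistic):
p.cart <- predict(model.cart, HeartData, type = "prob")[, "Yes"]
p.rf   <- predict(model.rf$finalModel, type = "prob")[, "Yes"]   # OOB probabilities
roc.cart <- roc(HeartData$HD, p.cart, levels = c("No", "Yes"), direction = "<")
roc.rf   <- roc(HeartData$HD, p.rf,   levels = c("No", "Yes"), direction = "<")
plot(roc.cart)
plot(roc.rf, add = TRUE, lty = 2)
auc(roc.cart); auc(roc.rf)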