# Folder that holds Heart.csv on the original author's machine.
project_dir <- "C:\\Users\\user\\Desktop\\R_CODE_2023"
# Switch directories only when that folder exists. An unconditional setwd()
# aborts the whole script on any other machine; with the guard, the script
# falls back to the current working directory and says so.
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Directory not found: ", project_dir,
    "; keeping working directory ", getwd(),
    call. = FALSE
  )
}
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(ggplot2)
library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(caret)
## Loading required package: lattice
library(splines)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rattle)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:rattle':
## 
##     importance

HEART DISEASE DATA

# Read the heart-disease data set from the working directory; the first row
# of the file holds the column names.
HeartData <- read.csv("Heart.csv", header = TRUE)

#HeartData

# Open the spreadsheet viewer only in interactive sessions: View() needs a
# GUI/display and fails when the script is run in batch mode.
if (interactive()) {
  View(HeartData)
}
# Row count before removing incomplete records.
nrow(HeartData)
## [1] 303
# Complete-case analysis: drop every row that has at least one missing value.
HeartData <- na.omit(HeartData)
# Row count after removing incomplete records.
nrow(HeartData)
## [1] 297
# 2 x 2 grid of box plots, each comparing one numeric predictor between the
# two AHD groups. The fourth panel previously repeated Age; it now shows
# RestBP, the remaining variable from the scatterplot matrix drawn below.
par(mfrow = c(2, 2))
boxplot(Age ~ as.factor(AHD), data = HeartData, xlab = "AHD", ylab = "Age")
boxplot(MaxHR ~ as.factor(AHD), data = HeartData, xlab = "AHD", ylab = "MaxHR")
boxplot(Chol ~ as.factor(AHD), data = HeartData, xlab = "AHD", ylab = "Chol")
boxplot(RestBP ~ as.factor(AHD), data = HeartData, xlab = "AHD", ylab = "RestBP")

# Back to a single-panel layout for the scatterplot matrix.
par(mfrow = c(1, 1))
# Selecting the columns by name keeps their names as panel labels
# (an unnamed cbind() matrix is labelled "var 1", "var 2", ...).
pairs(
  HeartData[, c("Chol", "MaxHR", "RestBP", "Age")],
  pch = 19, lower.panel = NULL, cex = 0.5
)

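Classification below is based on a cutoff of p.hat = 0.5: an observation is predicted "Yes" when its estimated probability of heart disease exceeds 0.5.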

5-fold CV, repeated 10 times

# Outcome as a factor, so that caret fits a classification (not regression)
# model.
HeartData$HD <- as.factor(HeartData$AHD)

# Resampling scheme: 5-fold cross-validation, repeated 10 times.
train_model <- trainControl(method = "repeatedcv", number = 5, repeats = 10)

# Fix the RNG seed so the random fold assignment, and therefore the
# accuracy table and the selected cp, are reproducible from run to run.
set.seed(2023)

# Classification tree (rpart); caret tunes the complexity parameter cp over
# its default grid and keeps the value with the highest resampled accuracy.
model.cart <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
    + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  trControl = train_model
)

# Resampling results for every cp value tried.
model.cart
## CART 
## 
## 297 samples
##   9 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 237, 237, 238, 238, 238, 238, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.01094891  0.7307288  0.4574042
##   0.02189781  0.7377966  0.4722918
##   0.48905109  0.6101299  0.1717318
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02189781.
# Show the splits of the tree refitted on all rows with the selected cp.
print(model.cart$finalModel)
## n= 297 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 297 137 No (0.5387205 0.4612795)  
##   2) as.factor(Thal)normal>=0.5 164  37 No (0.7743902 0.2256098) *
##   3) as.factor(Thal)normal< 0.5 133  33 Yes (0.2481203 0.7518797) *
# Confusion matrix for the selected tree, with "Yes" as the positive class.
# NOTE: the predictions are for the same rows the model was trained on, so
# these figures are in-sample (resubstitution) estimates.
confusionMatrix(
  data = predict(model.cart, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  127  37
##        Yes  33 100
##                                           
##                Accuracy : 0.7643          
##                  95% CI : (0.7118, 0.8114)
##     No Information Rate : 0.5387          
##     P-Value [Acc > NIR] : 7.203e-16       
##                                           
##                   Kappa : 0.5248          
##                                           
##  Mcnemar's Test P-Value : 0.7199          
##                                           
##             Sensitivity : 0.7299          
##             Specificity : 0.7937          
##          Pos Pred Value : 0.7519          
##          Neg Pred Value : 0.7744          
##              Prevalence : 0.4613          
##          Detection Rate : 0.3367          
##    Detection Prevalence : 0.4478          
##       Balanced Accuracy : 0.7618          
##                                           
##        'Positive' Class : Yes             
## 

# Detailed summary of the fitted tree.
summary(model.cart[["finalModel"]])

# Draw the fitted tree with rattle's styled tree plot.
fancyRpartPlot(model.cart[["finalModel"]])

# Fix the RNG seed so the fold assignment and the forest's bootstrap samples
# are reproducible from run to run.
set.seed(2023)

# Random forest on the same outcome and predictors as the tree model; caret
# tunes mtry (number of variables tried at each split) over its default grid
# using the resampling scheme stored in train_model.
model.rf <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
    + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rf",
  trControl = train_model
)
# Resampling results for every mtry value tried.
model.rf
## Random Forest 
## 
## 297 samples
##   9 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 238, 238, 237, 238, 237, 237, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7654124  0.5277834
##    6    0.7404972  0.4776255
##   10    0.7313785  0.4594746
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

#HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,

# List the components stored in the fitted forest (length, class and mode of
# each element); useful for seeing what can be extracted from the object.
summary(model.rf[["finalModel"]])
##                 Length Class      Mode     
## call               4   -none-     call     
## type               1   -none-     character
## predicted        297   factor     numeric  
## err.rate        1500   -none-     numeric  
## confusion          6   -none-     numeric  
## votes            594   matrix     numeric  
## oob.times        297   -none-     numeric  
## classes            2   -none-     character
## importance        10   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                297   factor     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            10   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              0   -none-     list
# Show the fitted forest: its call, number of trees, mtry, and the
# out-of-bag error rate with its confusion matrix.
print(model.rf$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 24.24%
## Confusion matrix:
##      No Yes class.error
## No  126  34   0.2125000
## Yes  38  99   0.2773723
# Plot the forest's error rates against the number of trees grown.
# The call expression is left exactly as written because the default plot
# title is derived from it.
plot(model.rf$finalModel)

# Variable importance taken directly from the underlying randomForest fit,
# one row per model-matrix column (factor predictors appear as dummy
# columns).
# NOTE(review): these look like raw, unscaled importances, whereas varImp()
# on the caret object rescales by default - confirm which one is wanted.
print(varImp(model.rf$finalModel))
##                             Overall
## Age                       17.712745
## as.factor(Sex)1            5.550317
## as.factor(Thal)normal     16.729774
## as.factor(Thal)reversable 13.550742
## Chol                      15.367344
## MaxHR                     23.847994
## RestBP                    13.858147
## Fbs                        2.341168
## RestECG                    3.935611
## ExAng                      9.823690
# Dot plot of caret's variable-importance scores for the tuned forest.
plot(varImp(model.rf))

# Out-of-bag estimate of P(HD = "Yes") for each training row: with no new
# data supplied, predict() on the randomForest fit returns the out-of-bag
# vote fractions. The column is picked by name; the previous positional
# index [, 1] selected the "No" column, i.e. the probability of *no*
# disease, while "Yes" is treated as the positive class everywhere else.
yhat <- predict(model.rf$finalModel, type = "prob")[, "Yes"]

# Estimated disease probability against maximum heart rate.
plot(
  HeartData$MaxHR, yhat,
  xlab = "MaxHR", ylab = "OOB estimate of P(HD = Yes)"
)

# The same scatter with a loess smooth (span 0.3) to show the trend.
scatter.smooth(
  HeartData$MaxHR, yhat,
  span = 0.3,
  xlab = "MaxHR", ylab = "OOB estimate of P(HD = Yes)"
)

# Out-of-bag confusion matrix: each row is classified using only the trees
# whose bootstrap sample did not contain it, so this is an honest estimate
# of performance on new data.
confusionMatrix(
  data = model.rf$finalModel$predicted,
  reference = HeartData$HD,
  positive = "Yes"
)

# Resubstitution confusion matrix: predictions for the very rows the forest
# was trained on. A forest nearly memorises its training data, so these
# figures are far too optimistic and must not be read as test accuracy;
# compare them with the out-of-bag matrix above.
confusionMatrix(
  data = predict(model.rf, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  157   6
##        Yes   3 131
##                                           
##                Accuracy : 0.9697          
##                  95% CI : (0.9433, 0.9861)
##     No Information Rate : 0.5387          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9389          
##                                           
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.9562          
##             Specificity : 0.9812          
##          Pos Pred Value : 0.9776          
##          Neg Pred Value : 0.9632          
##              Prevalence : 0.4613          
##          Detection Rate : 0.4411          
##    Detection Prevalence : 0.4512          
##       Balanced Accuracy : 0.9687          
##                                           
##        'Positive' Class : Yes             
##