options(scipen=999)
setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(rattle)
library(ggplot2)
library(ISLR2)
library(MASS)
library(caret)
library(splines)
library(pROC)
library(rattle)
library(randomForest)

HEART DISEASE DATA

HeartData  = read.csv("Heart.csv", header=TRUE)

#HeartData

View(HeartData)
nrow(HeartData)
[1] 303
HeartData = na.omit(HeartData)
nrow(HeartData)
[1] 297
par(mfrow=c(2,2))
boxplot(HeartData$Age ~ as.factor(HeartData$AHD))
boxplot(HeartData$MaxHR ~ as.factor(HeartData$AHD))
boxplot(HeartData$Chol ~ as.factor(HeartData$AHD))
boxplot(HeartData$Age ~ as.factor(HeartData$AHD))

par(mfrow=c(1,1))
pairs( cbind( HeartData$Chol, HeartData$MaxHR, HeartData$RestBP,HeartData$Age), pch=19, lower.panel=NULL, cex=.5)

looking at classification based on p.hat = .5 cutoff

10-fold CV, repeated 5 times

HeartData$HD = as.factor(HeartData$AHD)
train_model <- trainControl(method = "repeatedcv", number = 5, repeats=10)
MaxHR= HeartData$MaxHR
model.cart <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
        + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng, 
  data = HeartData, 
  method = "rpart",
  trControl = train_model)

model.cart
CART 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 238, 237, 238, 238, 237, 238, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.01094891  0.7271299  0.4498432
  0.02189781  0.7399605  0.4756894
  0.48905109  0.6077401  0.1681395

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.
model.cart$finalModel
n= 297 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 297 137 No (0.5387205 0.4612795)  
  2) as.factor(Thal)normal>=0.5 164  37 No (0.7743902 0.2256098) *
  3) as.factor(Thal)normal< 0.5 133  33 Yes (0.2481203 0.7518797) *
confusionMatrix(predict(model.cart, HeartData), 
                reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  127  37
       Yes  33 100
                                               
               Accuracy : 0.7643               
                 95% CI : (0.7118, 0.8114)     
    No Information Rate : 0.5387               
    P-Value [Acc > NIR] : 0.0000000000000007203
                                               
                  Kappa : 0.5248               
                                               
 Mcnemar's Test P-Value : 0.7199               
                                               
            Sensitivity : 0.7299               
            Specificity : 0.7937               
         Pos Pred Value : 0.7519               
         Neg Pred Value : 0.7744               
             Prevalence : 0.4613               
         Detection Rate : 0.3367               
   Detection Prevalence : 0.4478               
      Balanced Accuracy : 0.7618               
                                               
       'Positive' Class : Yes                  
                                               

summary(model.cart$finalModel)

fancyRpartPlot(model.cart$finalModel)

model.rf <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
  + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData, 
  method = "rf",
  trControl = train_model)
model.rf
Random Forest 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 238, 238, 237, 238, 237, 237, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7660734  0.5292625
   6    0.7447853  0.4865593
  10    0.7377119  0.4721681

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

#HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,

summary(model.rf$finalModel)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted        297   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes            594   matrix     numeric  
oob.times        297   -none-     numeric  
classes            2   -none-     character
importance        10   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y                297   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            10   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
model.rf$finalModel

Call:
 randomForest(x = x, y = y, mtry = param$mtry) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 21.89%
Confusion matrix:
     No Yes class.error
No  127  33   0.2062500
Yes  32 105   0.2335766
plot(model.rf$finalModel)

varImp(model.rf$finalModel)
                            Overall
Age                       17.653238
as.factor(Sex)1            5.900035
as.factor(Thal)normal     15.913850
as.factor(Thal)reversable 13.057684
Chol                      15.302029
MaxHR                     23.443338
RestBP                    13.918624
Fbs                        2.493889
RestECG                    4.040312
ExAng                     10.959801
plot( varImp(model.rf) )

yhat = predict(model.rf$finalModel, type="prob")[,1]
plot(HeartData$MaxHR, yhat)

scatter.smooth(HeartData$MaxHR, yhat, span=.3) 

confusionMatrix(predict(model.rf, HeartData), 
                reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  154   5
       Yes   6 132
                                             
               Accuracy : 0.963              
                 95% CI : (0.9347, 0.9814)   
    No Information Rate : 0.5387             
    P-Value [Acc > NIR] : <0.0000000000000002
                                             
                  Kappa : 0.9255             
                                             
 Mcnemar's Test P-Value : 1                  
                                             
            Sensitivity : 0.9635             
            Specificity : 0.9625             
         Pos Pred Value : 0.9565             
         Neg Pred Value : 0.9686             
             Prevalence : 0.4613             
         Detection Rate : 0.4444             
   Detection Prevalence : 0.4646             
      Balanced Accuracy : 0.9630             
                                             
       'Positive' Class : Yes                
                                             

#cart model for under 50 vs over 50

#train on everyone

model.cartl50 <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
  + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng, 
  data = HeartData, 
  method = "rpart",
  trControl = train_model)

Below we will examine the performance on the two age groups. Is there a disparity? In what cases does this matter?

#predict on under 50

confusionMatrix(predict(model.cartl50, HeartData[HeartData$Age<50,]), 
                reference=HeartData[HeartData$Age<50,]$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction No Yes
       No  54   6
       Yes  6  19
                                          
               Accuracy : 0.8588          
                 95% CI : (0.7664, 0.9249)
    No Information Rate : 0.7059          
    P-Value [Acc > NIR] : 0.0007925       
                                          
                  Kappa : 0.66            
                                          
 Mcnemar's Test P-Value : 1.0000000       
                                          
            Sensitivity : 0.7600          
            Specificity : 0.9000          
         Pos Pred Value : 0.7600          
         Neg Pred Value : 0.9000          
             Prevalence : 0.2941          
         Detection Rate : 0.2235          
   Detection Prevalence : 0.2941          
      Balanced Accuracy : 0.8300          
                                          
       'Positive' Class : Yes             
                                          

#predict on over 50

confusionMatrix(predict(model.cartl50, HeartData[HeartData$Age>=50,]), 
                reference=HeartData[HeartData$Age>=50,]$HD, positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction No Yes
       No  73  31
       Yes 27  81
                                          
               Accuracy : 0.7264          
                 95% CI : (0.6611, 0.7852)
    No Information Rate : 0.5283          
    P-Value [Acc > NIR] : 0.000000002763  
                                          
                  Kappa : 0.4522          
                                          
 Mcnemar's Test P-Value : 0.6936          
                                          
            Sensitivity : 0.7232          
            Specificity : 0.7300          
         Pos Pred Value : 0.7500          
         Neg Pred Value : 0.7019          
             Prevalence : 0.5283          
         Detection Rate : 0.3821          
   Detection Prevalence : 0.5094          
      Balanced Accuracy : 0.7266          
                                          
       'Positive' Class : Yes             
                                          

To unpack why there is a difference, let’s look at variable distribution differences by age

AHD=HeartData$AHD
Chol= HeartData$Chol
par(mfrow=c(2,2))
boxplot(MaxHR[HeartData$Age<50] ~ as.factor(AHD[HeartData$Age<50]),ylim=c(80,200))
boxplot(MaxHR[HeartData$Age>=50] ~ as.factor(AHD[HeartData$Age>=50]),ylim=c(80,200))
boxplot(Chol[HeartData$Age<50] ~ as.factor(AHD[HeartData$Age<50]),ylim=c(100,400))
boxplot(Chol[HeartData$Age>=50] ~ as.factor(AHD[HeartData$Age>=50]),ylim=c(100,400))

#categorical variable distribution differences

Thal= HeartData$Thal
par(mfrow=c(1,2))
plot(as.factor(Thal[HeartData$Age<50]), xlab="Less than 50", ylab="Count")
plot(as.factor(Thal[HeartData$Age>=50]), xlab="50 and older", ylab="Count")