# Session setup: number formatting, dplyr, and loading the heart data set.

# A large scipen penalty makes R prefer fixed notation over scientific
# notation when printing numbers.
options(scipen = 999)

library(dplyr)

setwd("C:/Users/user/Desktop/noble")
HeartData <- read.csv("Heart (1).csv", header = TRUE)

# attach(HeartData) is deliberately not called here. The data frame is
# attached further down, before the first use of bare column names, and
# attaching it twice only stacks a second, masked copy on the search path.

# dplyr::count() with no grouping columns returns the number of rows.
count(HeartData)

    n
1 303

# Preview the first five rows of the data set.
head(HeartData, n = 5)

  X Age Sex    ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
1 1  63   1      typical    145  233   1       2   150     0     2.3     3  0
2 2  67   1 asymptomatic    160  286   0       2   108     1     1.5     2  3
3 3  67   1 asymptomatic    120  229   0       2   129     1     2.6     2  2
4 4  37   1   nonanginal    130  250   0       0   187     0     3.5     3  0
5 5  41   0   nontypical    130  204   0       2   172     0     1.4     1  0
        Thal AHD
1      fixed  No
2     normal Yes
3 reversable Yes
4     normal  No
5     normal  No

library(ISLR2)
library(MASS)
library(splines)
library(pROC)
library(rattle)
library(caret)

# Exploratory plots of the continuous predictors against heart-disease
# status (AHD), followed by creation of the factor response HD.

# NOTE: attach() copies the columns as they are at this point. Later changes
# to HeartData (new columns, dropped rows) are NOT reflected in the attached
# names. It is kept only because later code still uses bare column names;
# the plots in this section read from HeartData explicitly.
attach(HeartData)

# 2 x 2 grid: one box plot per continuous predictor, split by AHD.
par(mfrow = c(2, 2))
boxplot(Age ~ as.factor(AHD), data = HeartData)
boxplot(MaxHR ~ as.factor(AHD), data = HeartData)
boxplot(Chol ~ as.factor(AHD), data = HeartData)
# The fourth panel previously repeated Age; RestBP is the remaining
# continuous predictor used in the scatterplot matrix below.
boxplot(RestBP ~ as.factor(AHD), data = HeartData)

par(mfrow = c(1, 1))

# Scatterplot matrix (upper triangle only) of the continuous predictors.
pairs(
  HeartData[, c("Chol", "MaxHR", "RestBP", "Age")],
  pch = 19,
  lower.panel = NULL,
  cex = 0.5
)

# Factor version of the outcome, used as the response in the models below.
HeartData$HD <- as.factor(HeartData$AHD)

Looking at classification based on a p.hat = 0.5 cutoff.

5-fold CV, repeated 10 times

# Resampling scheme passed as trControl to the caret::train() calls in this
# file: repeated cross-validation with 5 folds (number) and 10 repetitions
# (repeats).
train_model <- trainControl(method = "repeatedcv", number = 5, repeats = 10)

# Drop every row that has a missing value in any column.
HeartData <- na.omit(HeartData)

# Fix the RNG state so the cross-validation folds, and therefore the reported
# accuracy and the selected cp, are reproducible from run to run.
set.seed(2023)

# Classification tree (rpart) for heart disease, tuned over cp by caret.
model.cart <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
    + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  trControl = train_model
)

# Show the resampling results for each candidate cp.
print(model.cart)

CART 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 237, 238, 238, 237, 238, 237, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.01094891  0.7360734  0.4676307
  0.02189781  0.7404520  0.4775511
  0.48905109  0.6205932  0.1972206

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.

# Show the splits of the tree refit with the selected cp.
print(model.cart$finalModel)

n= 297 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 297 137 No (0.5387205 0.4612795)  
  2) as.factor(Thal)normal>=0.5 164  37 No (0.7743902 0.2256098) *
  3) as.factor(Thal)normal< 0.5 133  33 Yes (0.2481203 0.7518797) *

# Confusion matrix for the CART model, with "Yes" as the positive class.
# Predictions are made on the same data frame that was passed to train(),
# so these are in-sample figures.
confusionMatrix(
  data = predict(model.cart, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)

Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  127  37
       Yes  33 100
                                               
               Accuracy : 0.7643               
                 95% CI : (0.7118, 0.8114)     
    No Information Rate : 0.5387               
    P-Value [Acc > NIR] : 0.0000000000000007203
                                               
                  Kappa : 0.5248               
                                               
 Mcnemar's Test P-Value : 0.7199               
                                               
            Sensitivity : 0.7299               
            Specificity : 0.7937               
         Pos Pred Value : 0.7519               
         Neg Pred Value : 0.7744               
             Prevalence : 0.4613               
         Detection Rate : 0.3367               
   Detection Prevalence : 0.4478               
      Balanced Accuracy : 0.7618               
                                               
       'Positive' Class : Yes

# rattle supplies fancyRpartPlot().
library(rattle)

# Detailed node-by-node summary of the final tree, then a drawing of it.
summary(object = model.cart$finalModel)
fancyRpartPlot(model.cart$finalModel)

# Fix the RNG state so the cross-validation folds and the forest's bootstrap
# samples are reproducible from run to run.
set.seed(2023)

# Random forest on the same predictors as the CART model; caret tunes mtry
# using the resampling scheme stored in train_model.
model.rf <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
    + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rf",
  trControl = train_model
)

# Show the resampling results for each candidate mtry.
print(model.rf)

Random Forest 

297 samples
  9 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 237, 238, 237, 238, 238, 237, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7694011  0.5357627
   6    0.7404576  0.4773831
  10    0.7347232  0.4663167

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

# Alternative, smaller formula kept for reference (not used):
#   HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng

# List the components stored in the fitted random forest object.
summary(object = model.rf$finalModel)

                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted        297   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes            594   matrix     numeric  
oob.times        297   -none-     numeric  
classes            2   -none-     character
importance        10   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y                297   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            10   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list

# Show the final forest: call, number of trees, mtry, and OOB confusion matrix.
print(model.rf$finalModel)


Call:
 randomForest(x = x, y = y, mtry = param$mtry) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 24.24%
Confusion matrix:
     No Yes class.error
No  126  34   0.2125000
Yes  38  99   0.2773723

# Plot method for the final forest object.
plot(x = model.rf$finalModel)

# Variable importance taken directly from the final forest object.
varImp(object = model.rf$finalModel)

                            Overall
Age                       18.008814
as.factor(Sex)1            5.823228
as.factor(Thal)normal     17.089918
as.factor(Thal)reversable 12.876562
Chol                      14.827579
MaxHR                     22.645941
RestBP                    13.962065
Fbs                        2.411935
RestECG                    4.005220
ExAng                     10.669930

# Variable-importance plot computed from the caret train object.
plot(varImp(model.rf))

# Class probabilities from the final forest. No newdata is supplied, which
# per the randomForest documentation yields out-of-bag predictions.
# Column 1 is the first class level, which the printed output lists as 'No'.
yhat <- predict(model.rf$finalModel, type = "prob")[, 1]

# Relationship between that probability and MaxHR: raw scatter first, then
# with a loess smooth.
plot(HeartData$MaxHR, yhat)

scatter.smooth(HeartData$MaxHR, yhat, span = 0.3)

# Same smoothed view against cholesterol.
scatter.smooth(HeartData$Chol, yhat, span = 0.3)

## DAY 3

# Confusion matrix for the random forest, with "Yes" as the positive class.
# Predictions are made on the same data frame that was passed to train(),
# so these are in-sample figures.
confusionMatrix(
  data = predict(model.rf, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)

Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  158   6
       Yes   2 131
                                             
               Accuracy : 0.9731             
                 95% CI : (0.9476, 0.9883)   
    No Information Rate : 0.5387             
    P-Value [Acc > NIR] : <0.0000000000000002
                                             
                  Kappa : 0.9457             
                                             
 Mcnemar's Test P-Value : 0.2888             
                                             
            Sensitivity : 0.9562             
            Specificity : 0.9875             
         Pos Pred Value : 0.9850             
         Neg Pred Value : 0.9634             
             Prevalence : 0.4613             
         Detection Rate : 0.4411             
   Detection Prevalence : 0.4478             
      Balanced Accuracy : 0.9719             
                                             
       'Positive' Class : Yes

# Train on everyone: a CART model fit to all rows of HeartData (every age),
# later evaluated separately on the under-50 and 50-and-over subsets.

# Fix the RNG state so the cross-validation folds, and therefore the selected
# cp and the resulting tree, are reproducible from run to run.
set.seed(2023)

model.cartl50 <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
    + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  trControl = train_model
)

Below we will examine the performance on the two age groups. Is there a disparity? In what cases does this matter?

# Predict on patients under 50 and compare against their observed HD status.
confusionMatrix(
  data = predict(model.cartl50, newdata = subset(HeartData, Age < 50)),
  reference = subset(HeartData, Age < 50)$HD,
  positive = "Yes"
)

Confusion Matrix and Statistics

          Reference
Prediction No Yes
       No  54   6
       Yes  6  19
                                          
               Accuracy : 0.8588          
                 95% CI : (0.7664, 0.9249)
    No Information Rate : 0.7059          
    P-Value [Acc > NIR] : 0.0007925       
                                          
                  Kappa : 0.66            
                                          
 Mcnemar's Test P-Value : 1.0000000       
                                          
            Sensitivity : 0.7600          
            Specificity : 0.9000          
         Pos Pred Value : 0.7600          
         Neg Pred Value : 0.9000          
             Prevalence : 0.2941          
         Detection Rate : 0.2235          
   Detection Prevalence : 0.2941          
      Balanced Accuracy : 0.8300          
                                          
       'Positive' Class : Yes

# Predict on patients aged 50 and over and compare against their observed
# HD status.
confusionMatrix(
  data = predict(model.cartl50, newdata = subset(HeartData, Age >= 50)),
  reference = subset(HeartData, Age >= 50)$HD,
  positive = "Yes"
)

Confusion Matrix and Statistics

          Reference
Prediction No Yes
       No  73  31
       Yes 27  81
                                          
               Accuracy : 0.7264          
                 95% CI : (0.6611, 0.7852)
    No Information Rate : 0.5283          
    P-Value [Acc > NIR] : 0.000000002763  
                                          
                  Kappa : 0.4522          
                                          
 Mcnemar's Test P-Value : 0.6936          
                                          
            Sensitivity : 0.7232          
            Specificity : 0.7300          
         Pos Pred Value : 0.7500          
         Neg Pred Value : 0.7019          
             Prevalence : 0.5283          
         Detection Rate : 0.3821          
   Detection Prevalence : 0.5094          
      Balanced Accuracy : 0.7266          
                                          
       'Positive' Class : Yes

To unpack why there is a difference, let’s look at how the variable distributions differ between the two age groups.

# MaxHR and Chol by heart-disease status within each age group.
# Layout: top row MaxHR, bottom row Chol; left column under 50, right column
# 50 and older.
#
# Every column is taken from HeartData itself. The bare names MaxHR, Chol and
# AHD refer to the copies made by attach(), which was called before na.omit()
# removed rows. Those copies are longer than HeartData$Age, so indexing them
# with a mask built from HeartData$Age recycles the mask and selects the
# wrong rows.
age_under_50 <- HeartData[HeartData$Age < 50, ]
age_50_plus <- HeartData[HeartData$Age >= 50, ]

par(mfrow = c(2, 2))
boxplot(MaxHR ~ as.factor(AHD), data = age_under_50, ylim = c(80, 200),
        main = "Age < 50", xlab = "AHD", ylab = "MaxHR")
boxplot(MaxHR ~ as.factor(AHD), data = age_50_plus, ylim = c(80, 200),
        main = "Age >= 50", xlab = "AHD", ylab = "MaxHR")
boxplot(Chol ~ as.factor(AHD), data = age_under_50, ylim = c(100, 400),
        main = "Age < 50", xlab = "AHD", ylab = "Chol")
boxplot(Chol ~ as.factor(AHD), data = age_50_plus, ylim = c(100, 400),
        main = "Age >= 50", xlab = "AHD", ylab = "Chol")

# Categorical variable distribution differences: Thal counts per age group.
#
# Thal is read from HeartData rather than from the attached copy. The attached
# copy was made before na.omit() removed rows, so it is longer than
# HeartData$Age, and a mask built from HeartData$Age would be recycled onto
# the wrong rows.
# Both panels share one set of factor levels so the bars line up even if a
# category is absent from one age group.
thal_levels <- sort(unique(HeartData$Thal))

par(mfrow = c(1, 2))
plot(factor(HeartData$Thal[HeartData$Age < 50], levels = thal_levels),
     xlab = "Less than 50", ylab = "Count")
plot(factor(HeartData$Thal[HeartData$Age >= 50], levels = thal_levels),
     xlab = "50 and older", ylab = "Count")

NOBLE DAY 2

Waka Olivia

2023-07-04

5-fold CV, repeated 10 times

# Detailed node-by-node summary of the final CART tree.
summary(object = model.cart$finalModel)

Below we will examine the performance on the two age groups. Is there a disparity? In what cases does this matter?

To unpack why there is a difference, let’s look at how the variable distributions differ between the two age groups.