options(scipen=999)
setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(rattle)
library(ggplot2)
library(ISLR2)
library(MASS)
library(caret)
library(splines)
library(pROC)
library(randomForest)
HeartData = read.csv("Heart.csv", header=TRUE)
#HeartData
View(HeartData)
nrow(HeartData)
[1] 303
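# A quick base-R sketch (no new packages assumed): count missing values per
# column before dropping incomplete rows, so we can see which variables the
# na.omit() call below is acting on.
colSums(is.na(HeartData))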
HeartData = na.omit(HeartData)
nrow(HeartData)
[1] 297
par(mfrow=c(2,2))
boxplot(HeartData$Age    ~ as.factor(HeartData$AHD), main="Age by AHD")
boxplot(HeartData$MaxHR  ~ as.factor(HeartData$AHD), main="MaxHR by AHD")
boxplot(HeartData$Chol   ~ as.factor(HeartData$AHD), main="Chol by AHD")
boxplot(HeartData$RestBP ~ as.factor(HeartData$AHD), main="RestBP by AHD")
par(mfrow=c(1,1))
pairs( cbind( Chol=HeartData$Chol, MaxHR=HeartData$MaxHR,
              RestBP=HeartData$RestBP, Age=HeartData$Age),
       pch=19, lower.panel=NULL, cex=.5)
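# ggplot2 is loaded above but not otherwise used; a minimal sketch of one of
# the boxplots in ggplot form (same variables as above, nothing new assumed):
ggplot(HeartData, aes(x = as.factor(AHD), y = MaxHR)) +
  geom_boxplot() +
  labs(x = "AHD", y = "MaxHR")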
HeartData$HD = as.factor(HeartData$AHD)
train_model <- trainControl(method = "repeatedcv", number = 5, repeats=10)
model.cart <- train(
HD ~ Age + as.factor(Sex) + as.factor(Thal)
+ Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
data = HeartData,
method = "rpart",
trControl = train_model)
model.cart
CART
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 238, 237, 237, 238, 238, 237, ...
Resampling results across tuning parameters:
  cp          Accuracy   Kappa
  0.01094891  0.7289492  0.4537329
  0.02189781  0.7404237  0.4774736
  0.48905109  0.6519944  0.2697103
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.
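# caret's plot method for a train object shows the resampled accuracy across
# the cp grid summarised above.
plot(model.cart)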
model.cart$finalModel
n= 297
node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 297 137 No (0.5387205 0.4612795)
  2) as.factor(Thal)normal>=0.5 164 37 No (0.7743902 0.2256098) *
  3) as.factor(Thal)normal< 0.5 133 33 Yes (0.2481203 0.7518797) *
confusionMatrix(predict(model.cart, HeartData),
reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics
          Reference
Prediction  No Yes
       No  127  37
       Yes  33 100
Accuracy : 0.7643
95% CI : (0.7118, 0.8114)
No Information Rate : 0.5387
P-Value [Acc > NIR] : 0.0000000000000007203
Kappa : 0.5248
Mcnemar's Test P-Value : 0.7199
Sensitivity : 0.7299
Specificity : 0.7937
Pos Pred Value : 0.7519
Neg Pred Value : 0.7744
Prevalence : 0.4613
Detection Rate : 0.3367
Detection Prevalence : 0.4478
Balanced Accuracy : 0.7618
'Positive' Class : Yes
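# pROC is loaded above but not otherwise used; a hedged sketch of an ROC curve
# and AUC for the CART fit. These are resubstitution probabilities (predicted
# on the same rows used for fitting), so the AUC, like the accuracy above,
# will be optimistic.
cart.prob <- predict(model.cart, HeartData, type = "prob")[, "Yes"]
roc.cart  <- roc(response = HeartData$HD, predictor = cart.prob,
                 levels = c("No", "Yes"))
plot(roc.cart)
auc(roc.cart)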
fancyRpartPlot(model.cart$finalModel)
model.rf <- train(
HD ~ Age + as.factor(Sex) + as.factor(Thal)
+ Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
data = HeartData,
method = "rf",
trControl = train_model)
model.rf
Random Forest
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 238, 238, 238, 237, 237, 238, ...
Resampling results across tuning parameters:
  mtry  Accuracy   Kappa
   2    0.7645932  0.5262468
   6    0.7337345  0.4640833
  10    0.7256158  0.4474989
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
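# The default search only tried mtry = 2, 6 and 10. A sketch of a finer grid
# via train()'s tuneGrid argument; the values 1:6 are an illustrative choice,
# not part of the original run.
model.rf.grid <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal)
     + Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rf",
  tuneGrid = expand.grid(mtry = 1:6),
  trControl = train_model)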
#HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,
summary(model.rf$finalModel)
Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 297 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 594 matrix numeric
oob.times 297 -none- numeric
classes 2 -none- character
importance 10 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 297 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 10 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model.rf$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 23.57%
Confusion matrix:
     No Yes class.error
No  126  34   0.2125000
Yes  36 101   0.2627737
plot(model.rf$finalModel)
varImp(model.rf$finalModel)
Overall
Age 17.500869
as.factor(Sex)1 5.507407
as.factor(Thal)normal 15.580738
as.factor(Thal)reversable 13.890848
Chol 15.524651
MaxHR 24.629367
RestBP 14.526597
Fbs 2.268989
RestECG 3.961817
ExAng 10.524847
plot( varImp(model.rf) )
yhat = predict(model.rf$finalModel, type="prob")[,1]   # OOB probability of the first class, "No"; column 2 would give "Yes"
plot(HeartData$MaxHR, yhat)
scatter.smooth(HeartData$MaxHR, yhat, span=.3)
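# A hedged sketch of a partial-dependence view of MaxHR. The caret finalModel
# was fit on dummy-coded columns, so for simplicity this refits a forest
# directly on the raw columns (an illustrative refit with my own seed, not
# part of the original pipeline) and then calls randomForest::partialPlot().
hd <- HeartData
hd$Sex  <- as.factor(hd$Sex)
hd$Thal <- as.factor(hd$Thal)
set.seed(1)
rf.direct <- randomForest(HD ~ Age + Sex + Thal + Chol + MaxHR + RestBP +
                            Fbs + RestECG + ExAng,
                          data = hd, mtry = 2, ntree = 500)
partialPlot(rf.direct, pred.data = hd, x.var = "MaxHR",
            which.class = "Yes", main = "Partial dependence on MaxHR")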
confusionMatrix(predict(model.rf, HeartData),
reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics
          Reference
Prediction  No Yes
       No  156   7
       Yes   4 130
Accuracy : 0.963
95% CI : (0.9347, 0.9814)
No Information Rate : 0.5387
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.9254
Mcnemar's Test P-Value : 0.5465
Sensitivity : 0.9489
Specificity : 0.9750
Pos Pred Value : 0.9701
Neg Pred Value : 0.9571
Prevalence : 0.4613
Detection Rate : 0.4377
Detection Prevalence : 0.4512
Balanced Accuracy : 0.9620
'Positive' Class : Yes
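# The 96% accuracy above is computed on the same rows the forest was grown on,
# so it is optimistic; the OOB error printed earlier (23.57%) is the more
# honest figure. A sketch of a confusion matrix built from the OOB class
# predictions stored in the fitted forest (this assumes, as caret ensures,
# that they are in the same row order as HeartData):
confusionMatrix(model.rf$finalModel$predicted,
                reference = HeartData$HD, positive = "Yes")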