options(scipen=999)
setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(rattle)
library(ggplot2)
library(ISLR2)
library(MASS)
library(caret)
library(splines)
library(pROC)
library(rattle)
library(randomForest)
HeartData = read.csv("Heart.csv", header=TRUE)
#HeartData
View(HeartData)
nrow(HeartData)
[1] 303
HeartData = na.omit(HeartData)
nrow(HeartData)
[1] 297
par(mfrow=c(2,2))
boxplot(HeartData$Age ~ as.factor(HeartData$AHD))
boxplot(HeartData$MaxHR ~ as.factor(HeartData$AHD))
boxplot(HeartData$Chol ~ as.factor(HeartData$AHD))
boxplot(HeartData$Age ~ as.factor(HeartData$AHD))
par(mfrow=c(1,1))
pairs( cbind( HeartData$Chol, HeartData$MaxHR, HeartData$RestBP,HeartData$Age), pch=19, lower.panel=NULL, cex=.5)
HeartData$HD = as.factor(HeartData$AHD)
train_model <- trainControl(method = "repeatedcv", number = 5, repeats=10)
MaxHR= HeartData$MaxHR
model.cart <- train(
HD ~ Age + as.factor(Sex) + as.factor(Thal)
+ Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
data = HeartData,
method = "rpart",
trControl = train_model)
model.cart
CART
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 238, 237, 238, 238, 237, 238, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01094891 0.7271299 0.4498432
0.02189781 0.7399605 0.4756894
0.48905109 0.6077401 0.1681395
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.
model.cart$finalModel
n= 297
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 297 137 No (0.5387205 0.4612795)
2) as.factor(Thal)normal>=0.5 164 37 No (0.7743902 0.2256098) *
3) as.factor(Thal)normal< 0.5 133 33 Yes (0.2481203 0.7518797) *
confusionMatrix(predict(model.cart, HeartData),
reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 127 37
Yes 33 100
Accuracy : 0.7643
95% CI : (0.7118, 0.8114)
No Information Rate : 0.5387
P-Value [Acc > NIR] : 0.0000000000000007203
Kappa : 0.5248
Mcnemar's Test P-Value : 0.7199
Sensitivity : 0.7299
Specificity : 0.7937
Pos Pred Value : 0.7519
Neg Pred Value : 0.7744
Prevalence : 0.4613
Detection Rate : 0.3367
Detection Prevalence : 0.4478
Balanced Accuracy : 0.7618
'Positive' Class : Yes
fancyRpartPlot(model.cart$finalModel)
model.rf <- train(
HD ~ Age + as.factor(Sex) + as.factor(Thal)
+ Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
data = HeartData,
method = "rf",
trControl = train_model)
model.rf
Random Forest
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 238, 238, 237, 238, 237, 237, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7660734 0.5292625
6 0.7447853 0.4865593
10 0.7377119 0.4721681
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
#HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,
summary(model.rf$finalModel)
Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 297 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 594 matrix numeric
oob.times 297 -none- numeric
classes 2 -none- character
importance 10 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 297 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 10 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model.rf$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 21.89%
Confusion matrix:
No Yes class.error
No 127 33 0.2062500
Yes 32 105 0.2335766
plot(model.rf$finalModel)
varImp(model.rf$finalModel)
Overall
Age 17.653238
as.factor(Sex)1 5.900035
as.factor(Thal)normal 15.913850
as.factor(Thal)reversable 13.057684
Chol 15.302029
MaxHR 23.443338
RestBP 13.918624
Fbs 2.493889
RestECG 4.040312
ExAng 10.959801
plot( varImp(model.rf) )
yhat = predict(model.rf$finalModel, type="prob")[,1]
plot(HeartData$MaxHR, yhat)
scatter.smooth(HeartData$MaxHR, yhat, span=.3)
confusionMatrix(predict(model.rf, HeartData),
reference=HeartData$HD, positive="Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 154 5
Yes 6 132
Accuracy : 0.963
95% CI : (0.9347, 0.9814)
No Information Rate : 0.5387
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.9255
Mcnemar's Test P-Value : 1
Sensitivity : 0.9635
Specificity : 0.9625
Pos Pred Value : 0.9565
Neg Pred Value : 0.9686
Prevalence : 0.4613
Detection Rate : 0.4444
Detection Prevalence : 0.4646
Balanced Accuracy : 0.9630
'Positive' Class : Yes
#cart model for under 50 vs over 50
#train on everyone
model.cartl50 <- train(
HD ~ Age + as.factor(Sex) + as.factor(Thal)
+ Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
data = HeartData,
method = "rpart",
trControl = train_model)
#predict on under 50
confusionMatrix(predict(model.cartl50, HeartData[HeartData$Age<50,]),
reference=HeartData[HeartData$Age<50,]$HD, positive="Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 54 6
Yes 6 19
Accuracy : 0.8588
95% CI : (0.7664, 0.9249)
No Information Rate : 0.7059
P-Value [Acc > NIR] : 0.0007925
Kappa : 0.66
Mcnemar's Test P-Value : 1.0000000
Sensitivity : 0.7600
Specificity : 0.9000
Pos Pred Value : 0.7600
Neg Pred Value : 0.9000
Prevalence : 0.2941
Detection Rate : 0.2235
Detection Prevalence : 0.2941
Balanced Accuracy : 0.8300
'Positive' Class : Yes
#predict on over 50
confusionMatrix(predict(model.cartl50, HeartData[HeartData$Age>=50,]),
reference=HeartData[HeartData$Age>=50,]$HD, positive="Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 73 31
Yes 27 81
Accuracy : 0.7264
95% CI : (0.6611, 0.7852)
No Information Rate : 0.5283
P-Value [Acc > NIR] : 0.000000002763
Kappa : 0.4522
Mcnemar's Test P-Value : 0.6936
Sensitivity : 0.7232
Specificity : 0.7300
Pos Pred Value : 0.7500
Neg Pred Value : 0.7019
Prevalence : 0.5283
Detection Rate : 0.3821
Detection Prevalence : 0.5094
Balanced Accuracy : 0.7266
'Positive' Class : Yes
AHD=HeartData$AHD
Chol= HeartData$Chol
par(mfrow=c(2,2))
boxplot(MaxHR[HeartData$Age<50] ~ as.factor(AHD[HeartData$Age<50]),ylim=c(80,200))
boxplot(MaxHR[HeartData$Age>=50] ~ as.factor(AHD[HeartData$Age>=50]),ylim=c(80,200))
boxplot(Chol[HeartData$Age<50] ~ as.factor(AHD[HeartData$Age<50]),ylim=c(100,400))
boxplot(Chol[HeartData$Age>=50] ~ as.factor(AHD[HeartData$Age>=50]),ylim=c(100,400))
#categorical variable distribution differences
Thal= HeartData$Thal
par(mfrow=c(1,2))
plot(as.factor(Thal[HeartData$Age<50]), xlab="Less than 50", ylab="Count")
plot(as.factor(Thal[HeartData$Age>=50]), xlab="50 and older", ylab="Count")