# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory; the relative
# file name passed to read.csv() below is resolved against it.
setwd("C:/Users/user/Desktop/noble")
HeartData <- read.csv("Heart (1).csv", header = TRUE)

# The attach(HeartData) that used to sit here was dropped: no statement before
# the second attach() call further down uses bare column names, and attaching
# the same data frame twice only adds a duplicate copy to the search path.

# Number of rows in the raw data (count() with no grouping columns).
count(HeartData)
n
1 303
# Preview the first five rows of the raw data.
head(HeartData, n = 5)
X Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
1 1 63 1 typical 145 233 1 2 150 0 2.3 3 0
2 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3
3 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2
4 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0
5 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0
Thal AHD
1 fixed No
2 normal Yes
3 reversable Yes
4 normal No
5 normal No
library(ISLR2)
library(MASS)
library(splines)
library(pROC)
library(rattle)
library(caret)

# Retained in case code further down refers to bare column names.
# NOTE: the attached copy is a snapshot of HeartData at this point; it does
# not follow later modifications of the data frame.
attach(HeartData)

# Boxplots of four continuous predictors, split by heart-disease status (AHD).
par(mfrow = c(2, 2))
boxplot(Age ~ as.factor(AHD), data = HeartData)
boxplot(MaxHR ~ as.factor(AHD), data = HeartData)
boxplot(Chol ~ as.factor(AHD), data = HeartData)
# The fourth panel previously repeated the Age boxplot. RestBP is the one
# variable from the pairs() plot below that had no boxplot of its own.
# NOTE(review): confirm RestBP is the intended fourth panel.
boxplot(RestBP ~ as.factor(AHD), data = HeartData)
par(mfrow = c(1, 1))

# Pairwise scatterplots of the same four predictors (upper triangle only).
pairs(
  HeartData[, c("Chol", "MaxHR", "RestBP", "Age")],
  pch = 19, lower.panel = NULL, cex = .5
)

# Factor version of the outcome, used as the response in the models.
HeartData$HD <- as.factor(HeartData$AHD)
# Looking at classification based on a p.hat = 0.5 cutoff.
# Resampling scheme: 5-fold cross-validation repeated 10 times.
train_model <- trainControl(method = "repeatedcv", number = 5, repeats = 10)

# Drop every row that has a missing value in any column.
# NOTE: columns exposed earlier through attach() are not updated by this, so
# bare column names still refer to the data as it was before this line.
HeartData <- na.omit(HeartData)

# Fix the RNG state so the fold assignment, and therefore the resampling
# results and selected cp, can be reproduced from run to run.
set.seed(123)

# Classification tree (rpart) for heart disease, tuned over cp by caret.
model.cart <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal) +
    Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  trControl = train_model
)
model.cart
CART
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 237, 238, 238, 237, 238, 237, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01094891 0.7360734 0.4676307
0.02189781 0.7404520 0.4775511
0.48905109 0.6205932 0.1972206
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02189781.
# Show the splits of the tree refit on all rows with the selected cp.
print(model.cart$finalModel)
n= 297
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 297 137 No (0.5387205 0.4612795)
2) as.factor(Thal)normal>=0.5 164 37 No (0.7743902 0.2256098) *
3) as.factor(Thal)normal< 0.5 133 33 Yes (0.2481203 0.7518797) *
# Confusion matrix of the CART predictions on the same rows the model was
# fit on, treating "Yes" as the positive class.
confusionMatrix(
  data = predict(model.cart, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 127 37
Yes 33 100
Accuracy : 0.7643
95% CI : (0.7118, 0.8114)
No Information Rate : 0.5387
P-Value [Acc > NIR] : 0.0000000000000007203
Kappa : 0.5248
Mcnemar's Test P-Value : 0.7199
Sensitivity : 0.7299
Specificity : 0.7937
Pos Pred Value : 0.7519
Neg Pred Value : 0.7744
Prevalence : 0.4613
Detection Rate : 0.3367
Detection Prevalence : 0.4478
Balanced Accuracy : 0.7618
'Positive' Class : Yes
library(rattle)

# Draw the final CART tree.
fancyRpartPlot(model.cart$finalModel)

# Fix the RNG state so the fold assignment and the forest's random sampling
# can be reproduced from run to run.
set.seed(123)

# Random forest with the same formula, data and resampling scheme as the
# CART model; caret tunes mtry.
model.rf <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal) +
    Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rf",
  trControl = train_model
)
model.rf
Random Forest
297 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 237, 238, 237, 238, 238, 237, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7694011 0.5357627
6 0.7404576 0.4773831
10 0.7347232 0.4663167
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Commented-out alternative formula (presumably a smaller model that was
# considered; not used):
#   HD ~ Chol + MaxHR + RestBP + as.factor(Thal) + ExAng,

# List the components stored in the fitted random-forest object.
summary(object = model.rf$finalModel)
Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 297 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 594 matrix numeric
oob.times 297 -none- numeric
classes 2 -none- character
importance 10 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 297 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 10 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
# Show the final forest: call, number of trees, mtry, and OOB confusion matrix.
print(model.rf$finalModel)
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 24.24%
Confusion matrix:
No Yes class.error
No 126 34 0.2125000
Yes 38 99 0.2773723
# Plot the final forest (the expression is passed directly so the default
# plot title, taken from the argument text, is unchanged).
plot(x = model.rf$finalModel)

# Variable importance computed on the final forest.
varImp(object = model.rf$finalModel)
Overall
Age 18.008814
as.factor(Sex)1 5.823228
as.factor(Thal)normal 17.089918
as.factor(Thal)reversable 12.876562
Chol 14.827579
MaxHR 22.645941
RestBP 13.962065
Fbs 2.411935
RestECG 4.005220
ExAng 10.669930
# Variable-importance plot from the caret wrapper object.
plot(varImp(model.rf))

# First column of the forest's class-probability matrix, predicted without
# new data.
# NOTE(review): column 1 is the first class level ('No' in the printed model
# output), so yhat is P(No), not P(Yes) - confirm this is intended.
yhat <- predict(model.rf$finalModel, type = "prob")[, 1]

# Predicted probability against MaxHR and Chol, raw and with a loess smooth.
plot(x = HeartData$MaxHR, y = yhat)
scatter.smooth(x = HeartData$MaxHR, y = yhat, span = .3)
scatter.smooth(x = HeartData$Chol, y = yhat, span = .3)

## DAY 3
# Confusion matrix of the random-forest predictions on the same rows the
# model was fit on, treating "Yes" as the positive class.
confusionMatrix(
  data = predict(model.rf, newdata = HeartData),
  reference = HeartData$HD,
  positive = "Yes"
)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 158 6
Yes 2 131
Accuracy : 0.9731
95% CI : (0.9476, 0.9883)
No Information Rate : 0.5387
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.9457
Mcnemar's Test P-Value : 0.2888
Sensitivity : 0.9562
Specificity : 0.9875
Pos Pred Value : 0.9850
Neg Pred Value : 0.9634
Prevalence : 0.4613
Detection Rate : 0.4411
Detection Prevalence : 0.4478
Balanced Accuracy : 0.9719
'Positive' Class : Yes
# Train on everyone (all rows of HeartData).
# Fix the RNG state so the fold assignment, and therefore the selected cp,
# can be reproduced from run to run.
set.seed(123)
model.cartl50 <- train(
  HD ~ Age + as.factor(Sex) + as.factor(Thal) +
    Chol + MaxHR + RestBP + Fbs + RestECG + ExAng,
  data = HeartData,
  method = "rpart",
  trControl = train_model
)

# Predict on patients under 50 and compare with their observed status,
# treating "Yes" as the positive class.
confusionMatrix(
  data = predict(model.cartl50, newdata = HeartData[HeartData$Age < 50, ]),
  reference = HeartData[HeartData$Age < 50, ]$HD,
  positive = "Yes"
)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 54 6
Yes 6 19
Accuracy : 0.8588
95% CI : (0.7664, 0.9249)
No Information Rate : 0.7059
P-Value [Acc > NIR] : 0.0007925
Kappa : 0.66
Mcnemar's Test P-Value : 1.0000000
Sensitivity : 0.7600
Specificity : 0.9000
Pos Pred Value : 0.7600
Neg Pred Value : 0.9000
Prevalence : 0.2941
Detection Rate : 0.2235
Detection Prevalence : 0.2941
Balanced Accuracy : 0.8300
'Positive' Class : Yes
# Predict on patients aged 50 and over and compare with their observed status,
# treating "Yes" as the positive class. local() keeps the helper data frame
# out of the global environment; the result is still auto-printed.
local({
  age_50_plus <- HeartData[HeartData$Age >= 50, ]
  confusionMatrix(
    data = predict(model.cartl50, newdata = age_50_plus),
    reference = age_50_plus$HD,
    positive = "Yes"
  )
})
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 73 31
Yes 27 81
Accuracy : 0.7264
95% CI : (0.6611, 0.7852)
No Information Rate : 0.5283
P-Value [Acc > NIR] : 0.000000002763
Kappa : 0.4522
Mcnemar's Test P-Value : 0.6936
Sensitivity : 0.7232
Specificity : 0.7300
Pos Pred Value : 0.7500
Neg Pred Value : 0.7019
Prevalence : 0.5283
Detection Rate : 0.3821
Detection Prevalence : 0.5094
Balanced Accuracy : 0.7266
'Positive' Class : Yes
# Compare predictor distributions between patients under 50 and patients
# aged 50 and over.
#
# Both age groups are taken from the current HeartData, so the age filter and
# the plotted columns always come from the same rows. The earlier version
# indexed bare MaxHR / Chol / AHD / Thal (attached before rows with missing
# values were removed) with a mask built from the shorter HeartData$Age,
# which misaligned rows and recycled the mask.
under_50 <- HeartData[HeartData$Age < 50, ]
age_50_up <- HeartData[HeartData$Age >= 50, ]

# MaxHR and Chol by heart-disease status; shared y-limits per variable so the
# two age groups can be compared directly.
par(mfrow = c(2, 2))
boxplot(MaxHR ~ as.factor(AHD), data = under_50,
        ylim = c(80, 200), main = "Age < 50")
boxplot(MaxHR ~ as.factor(AHD), data = age_50_up,
        ylim = c(80, 200), main = "Age >= 50")
boxplot(Chol ~ as.factor(AHD), data = under_50,
        ylim = c(100, 400), main = "Age < 50")
boxplot(Chol ~ as.factor(AHD), data = age_50_up,
        ylim = c(100, 400), main = "Age >= 50")

###### Categorical variable distribution differences
# Bar charts of Thal category counts in each age group.
par(mfrow = c(1, 2))
plot(as.factor(under_50$Thal), xlab = "Less than 50", ylab = "Count")
plot(as.factor(age_50_up$Thal), xlab = "50 and older", ylab = "Count")