set.seed(3333)
levels(trainData.df$Anaemic) <- c("N","Y")
# fitting decision tree classification model
DTModel <- train(Anaemic ~ bmi_3cat
+ place_resi
+ wealth_index
+ religion.1
+ education
+ caste
+ dietary.diversity.score,
data = trainData.df,
method = "rpart",
metric = "ROC",
parms = list(split = "gini"),
trControl = trctrl)
# model summary
DTModel## CART
##
## 499478 samples
## 7 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 249740, 249738
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0006257795 0.5631980 0.08617113
## 0.0124805811 0.5600907 0.06606325
## 0.0323348577 0.5498601 0.02283332
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0006257795.
rpart.plot# viasulaziation
library(rpart.plot)
prp(DTModel$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)# plot of probabilities
plot(PredDTModel$Y,
main = "Scatterplot of Probabilities of anaemic person",
xlab = "Customer ID",
ylab = "Predicted Probability of Anaemia")# taking the cut-off probability 50%
pred.DT <- ifelse(PredDTModel$Y > 0.50, "Y", "N")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the vectors
levels(testData.df$Anaemic) <- c("N","Y")
Predicted <- ordered(Pred, levels = c( "Y", "N"))
Actual <- ordered(testData.df$Anaemic,levels = c( "Y", "N"))
# making confusion matrix
cm <-confusionMatrix(table(Predicted,Actual))
cm## Confusion Matrix and Statistics
##
## Actual
## Predicted Y N
## Y 53928 40264
## N 13812 16864
##
## Accuracy : 0.5669
## 95% CI : (0.5642, 0.5697)
## No Information Rate : 0.5425
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.0947
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7961
## Specificity : 0.2952
## Pos Pred Value : 0.5725
## Neg Pred Value : 0.5497
## Prevalence : 0.5425
## Detection Rate : 0.4319
## Detection Prevalence : 0.7543
## Balanced Accuracy : 0.5456
##
## 'Positive' Class : Y
##
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
# predicting the test set results
PredDT <- predict(DTModel, testData.df, type = "prob")
C1 <- ifelse(PredDT$Y > cutoff, "Y", "N")
C2 <- testData.df$Anaemic
predY <- as.factor(C1)
actualY <- as.factor(C2)
Predicted <- ordered(predY, levels = c("Y", "N"))
Actual <- ordered(actualY,levels = c("Y", "N"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(data = Predicted,reference = Actual, positive = "Y")
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
# sequence of cut-off points
cutoff1 <- seq( 0, 1, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# creating matrix of different metrics
numrows = length(cutoff1)
pm <- matrix(1:numrows*4, nrow = numrows, ncol = 4)
# applying for loop
for (i in 1:numrows){
pm[i,] = tab2[[i]]}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
Senstivity = V2 ,Specificity = V3, kappa = V4)
# printing the table
print(pm)## cutoff Accuracy Senstivity Specificity kappa
## 1 0.00 0.5424929 1.0000000 0.0000000 0.00000000
## 2 0.05 0.5424929 1.0000000 0.0000000 0.00000000
## 3 0.10 0.5424929 1.0000000 0.0000000 0.00000000
## 4 0.15 0.5424929 1.0000000 0.0000000 0.00000000
## 5 0.20 0.5424929 1.0000000 0.0000000 0.00000000
## 6 0.25 0.5424929 1.0000000 0.0000000 0.00000000
## 7 0.30 0.5424929 1.0000000 0.0000000 0.00000000
## 8 0.35 0.5424929 1.0000000 0.0000000 0.00000000
## 9 0.40 0.5424929 1.0000000 0.0000000 0.00000000
## 10 0.45 0.5578771 0.9440213 0.1000035 0.04710024
## 11 0.50 0.5669347 0.7961027 0.2951968 0.09473542
## 12 0.55 0.5669347 0.7961027 0.2951968 0.09473542
## 13 0.60 0.4575071 0.0000000 1.0000000 0.00000000
## 14 0.65 0.4575071 0.0000000 1.0000000 0.00000000
## 15 0.70 0.4575071 0.0000000 1.0000000 0.00000000
## 16 0.75 0.4575071 0.0000000 1.0000000 0.00000000
## 17 0.80 0.4575071 0.0000000 1.0000000 0.00000000
## 18 0.85 0.4575071 0.0000000 1.0000000 0.00000000
## 19 0.90 0.4575071 0.0000000 1.0000000 0.00000000
## 20 0.95 0.4575071 0.0000000 1.0000000 0.00000000
## 21 1.00 0.4575071 0.0000000 1.0000000 0.00000000
plot(pm$cutoff,pm$Senstivity,
pch = 16,
xlab = "Cut-off Probabilities",
ylab = "Performance",
ylim = c(0,1),
xlim = c(0,1),
type = "l",
lwd = 2,
col= "red")
lines(pm$cutoff, pm$Specificity,col= "blue",lwd = 2)
lines(pm$cutoff, pm$Accuracy,col= "black")
legend(0.1, 0.3, legend = c("Senstivity", "Specificity","Accuracy"), col=c("red","blue","green"), cex=0.8,lty=1)## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
PredDT <- predict(DTModel, testData.df, type = "prob")
Prediction <- prediction(PredDT$Y,testData.df$Anaemic)
performance <- performance(Prediction, "tpr","fpr")
# plotting ROC curve
plot(performance,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")library(ROCR)
# area under curve
Prediction <- prediction(PredDT$Y,testData.df$Anaemic)
aucDT <- performance(Prediction, measure = "auc")
aucDT <- aucDT@y.values[[1]]
aucDT## [1] 0.5475826