library(rpart)
## Warning: package 'rpart' was built under R version 4.1.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
# Pull out the class label column and count the rows in each species.
v <- iris[["Species"]]
table(v)
## v
## setosa versicolor virginica
## 50 50 50
# Reproducible random train/test split of iris.
set.seed(522)
# Draw one uniform(0, 1) number per row; rows whose draw falls below 0.75 go
# to the training set, so the split is roughly (not exactly) 75/25.
inTrain <- runif(nrow(iris)) < 0.75
# Keep the 1/0 indicator column on iris as before (1 = train, 0 = test).
iris[, "train"] <- as.numeric(inTrain)
trainSet <- iris[inTrain, ]
testSet <- iris[!inTrain, ]
# Locate the helper column by exact name. grep("train", ...) is a regex
# substring match and would also hit any other column whose name merely
# contains "train".
trainColNum <- match("train", names(trainSet))
# Strip the helper column from both sets so only the original iris columns
# remain; drop = FALSE guarantees the result stays a data frame.
trainSet <- trainSet[, -trainColNum, drop = FALSE]
testSet <- testSet[, -trainColNum, drop = FALSE]
# Grow a classification tree for Species using every other column of the
# training data as a candidate predictor. The call is written inline (not
# built from variables) because printcp() echoes the recorded call back.
treeFit <- rpart(
  formula = Species ~ .,
  data = trainSet,
  method = "class"
)
print(treeFit)
## n= 111
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 111 74 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 37 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 74 37 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 39 2 versicolor (0.00000000 0.94871795 0.05128205) *
## 7) Petal.Width>=1.75 35 0 virginica (0.00000000 0.00000000 1.00000000) *
# Draw the fitted tree.
# NOTE(review): only two box colours are supplied for a three-class tree;
# box.col looks to be recycled per node rather than mapped to the predicted
# class -- confirm this colouring is intended.
rpart.plot(treeFit, box.col = c("red", "green"))

# Classify the held-out rows. The Species column (column 5 of testSet) is
# left out so that only the predictor columns are handed to predict().
Prediction1 <- predict(
  treeFit,
  newdata = testSet[setdiff(names(testSet), "Species")],
  type = "class"
)
# Compare the predicted labels with the true ones: confusion matrix,
# accuracy and per-class statistics.
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(data = Prediction1, reference = testSet$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 12 3
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.8974
## 95% CI : (0.7578, 0.9713)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 3.435e-13
##
## Kappa : 0.8462
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9231 0.7692
## Specificity 1.0000 0.8846 0.9615
## Pos Pred Value 1.0000 0.8000 0.9091
## Neg Pred Value 1.0000 0.9583 0.8929
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3077 0.2564
## Detection Prevalence 0.3333 0.3846 0.2821
## Balanced Accuracy 1.0000 0.9038 0.8654
# Pruning the decision tree ----
# Show the complexity-parameter (CP) table recorded on the fitted tree.
printcp(treeFit)
##
## Classification tree:
## rpart(formula = Species ~ ., data = trainSet, method = "class")
##
## Variables actually used in tree construction:
## [1] Petal.Length Petal.Width
##
## Root node error: 74/111 = 0.66667
##
## n= 111
##
## CP nsplit rel error xerror xstd
## 1 0.50000 0 1.000000 1.189189 0.057705
## 2 0.47297 1 0.500000 0.783784 0.071115
## 3 0.01000 2 0.027027 0.067568 0.029529
# Pick the CP-table row with the smallest xerror value and prune the tree
# back at that row's complexity parameter.
opt <- which.min(treeFit[["cptable"]][, "xerror"])
cp <- treeFit[["cptable"]][opt, "CP"]
pruned_model <- prune(treeFit, cp = cp)
# Plot the pruned tree with the same box colours as the unpruned plot.
rpart.plot(pruned_model, box.col = c("red", "green"))
