LAB 5

library(rpart)

## Warning: package 'rpart' was built under R version 4.1.2

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.1.2

# Pull the class labels out of the built-in iris data and count how many
# observations belong to each species.
v <- iris[["Species"]]

table(v)

## v
##     setosa versicolor  virginica 
##         50         50         50

# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(522)

# Draw one Uniform(0, 1) value per row; rows whose draw falls below 0.75 are
# flagged 1 (training) and the rest 0 (test), giving roughly a 75/25 split.
iris[, 'train'] <- ifelse(runif(nrow(iris)) < 0.75, 1, 0)

trainSet <- iris[iris$train == 1, ]
testSet <- iris[iris$train == 0, ]

# Locate the helper flag column by exact name. A regex search such as
# grep('train', ...) would also hit any other column whose name merely
# contains "train".
trainColNum <- match('train', names(trainSet))

# Drop the flag column so only the original iris variables remain in both
# subsets.
trainSet <- trainSet[, -trainColNum]
testSet <- testSet[, -trainColNum]

# Grow a classification tree that predicts Species from every other column
# of the training data, then show its node-by-node text summary.
treeFit <- rpart(
  formula = Species ~ .,
  data = trainSet,
  method = "class"
)
print(treeFit)

## n= 111 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 111 74 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 37  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 74 37 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 39  2 versicolor (0.00000000 0.94871795 0.05128205) *
##     7) Petal.Width>=1.75 35  0 virginica (0.00000000 0.00000000 1.00000000) *

# Draw the fitted tree. rpart.plot recycles a box.col vector over the rows of
# treeFit$frame (one per node), so a bare two-colour vector would not line up
# with the three classes; indexing a per-class palette by each node's
# predicted class gives every species its own fill.
rpart.plot(treeFit, box.col = c("red", "green", "blue")[treeFit$frame$yval])

# Classify the held-out rows. The response column is excluded by name rather
# than by position, so the call keeps working if the column order changes.
Prediction1 <- predict(
  treeFit,
  newdata = testSet[, names(testSet) != "Species", drop = FALSE],
  type = "class"
)

## Print the confusion matrix to check the accuracy and other statistics
library(caret)

## Warning: package 'caret' was built under R version 4.1.2

## Loading required package: ggplot2

## Loading required package: lattice

# Cross-tabulate predicted against actual species on the test set and report
# accuracy together with the per-class statistics.
confusionMatrix(data = Prediction1, reference = testSet[["Species"]])

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         13          0         0
##   versicolor      0         12         3
##   virginica       0          1        10
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8974          
##                  95% CI : (0.7578, 0.9713)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 3.435e-13       
##                                           
##                   Kappa : 0.8462          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.7692
## Specificity                 1.0000            0.8846           0.9615
## Pos Pred Value              1.0000            0.8000           0.9091
## Neg Pred Value              1.0000            0.9583           0.8929
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3077           0.2564
## Detection Prevalence        0.3333            0.3846           0.2821
## Balanced Accuracy           1.0000            0.9038           0.8654

## Pruning the decision tree
# Show the complexity-parameter table (CP, number of splits, relative and
# cross-validated error) that guides how far to prune.
printcp(x = treeFit)

## 
## Classification tree:
## rpart(formula = Species ~ ., data = trainSet, method = "class")
## 
## Variables actually used in tree construction:
## [1] Petal.Length Petal.Width 
## 
## Root node error: 74/111 = 0.66667
## 
## n= 111 
## 
##        CP nsplit rel error   xerror     xstd
## 1 0.50000      0  1.000000 1.189189 0.057705
## 2 0.47297      1  0.500000 0.783784 0.071115
## 3 0.01000      2  0.027027 0.067568 0.029529

# Row of the CP table with the smallest cross-validated error (xerror).
opt <- which.min(treeFit$cptable[, "xerror"])

# Complexity parameter of that row; prune() cuts the tree back to it.
cp <- treeFit$cptable[opt, "CP"]
pruned_model <- prune(treeFit, cp = cp)

# Plot the pruned tree. A box.col vector is recycled over the rows of
# pruned_model$frame (one per node), so the palette is indexed by each node's
# predicted class to give every species its own fill colour.
rpart.plot(
  pruned_model,
  box.col = c("red", "green", "blue")[pruned_model$frame$yval]
)

LAB 5

Akshay Palani

17/01/2022