# library for exporting R tables to LaTeX/HTML (xtable)
library(xtable)
# library for Classification & Regression Trees (rpart)
library(rpart)
#iris data set gives the measurements in centimeters of the
#variables sepal length and width and petal length and width,
#respectively, for 50 flowers from each of 3 species of iris
# Load the built-in iris data set into the workspace
data("iris")
# Keep a working copy in `ir`; the steps below modify `ir`, not `iris`
ir <- iris
# Show the structure of the original data
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Recode the class variable as numeric level codes.
# as.factor() is applied first so that as.numeric() returns the integer
# level code of each observation.
ir$Species <- as.numeric(as.factor(ir$Species))
# Show the structure after recoding
str(object = ir)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
# The class variable has three levels:
# 1 = setosa, 2 = versicolor, 3 = virginica
v <- ir[["Species"]]
# Number of observations per class
table(v)
## v
## 1 2 3
## 50 50 50
# Set the seed so the random split below is reproducible
set.seed(250)
# Flag each row as training (1) with probability 0.75, otherwise test (0).
# The draw is independent per row, so the 3:1 ratio is only approximate.
ir[, "train"] <- ifelse(runif(nrow(ir)) < 0.75, 1, 0)
# Separate training and test sets using the flag
train_iris <- ir[ir$train == 1, ]
test_iris <- ir[ir$train == 0, ]
# Column index of the train flag. An exact name comparison is used instead
# of grep(), which treats 'train' as a regular expression and would also
# return any other column whose name merely contains "train".
iris_trainColNum <- which(names(train_iris) == "train")
# Exactly one flag column must exist; an empty index would make the later
# negative column subsetting drop every column.
stopifnot(length(iris_trainColNum) == 1L)
str(test_iris)
## 'data.frame': 36 obs. of 6 variables:
## $ Sepal.Length: num 4.9 4.6 5 5.4 4.6 4.9 5.1 5 5.2 4.8 ...
## $ Sepal.Width : num 3 3.1 3.6 3.9 3.4 3.1 3.7 3 3.5 3.1 ...
## $ Petal.Length: num 1.4 1.5 1.4 1.7 1.4 1.5 1.5 1.6 1.5 1.6 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.4 0.3 0.1 0.4 0.2 0.2 0.2 ...
## $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
## $ train : num 0 0 0 0 0 0 0 0 0 0 ...
# Obtain the final train and test data sets:
# remove the train flag column from both
train_iris <- train_iris[, -iris_trainColNum]
test_iris <- test_iris[, -iris_trainColNum]
# Column index of the predicted variable. It is looked up by exact name in
# test_iris, the data frame this index is later applied to, rather than by
# regular expression in ir (which still carries the train flag column).
typeColNum_iris <- which(names(test_iris) == "Species")
# Fail loudly if the response column is missing or ambiguous
stopifnot(length(typeColNum_iris) == 1L)
# Fit a classification tree predicting Species from all other columns
rpart_model_iris <- rpart(
  Species ~ .,
  data = train_iris,
  method = "class"
)
# Draw the tree skeleton, then add the split labels
plot(rpart_model_iris)
text(rpart_model_iris)

# Detailed node-by-node description of the fitted tree
summary(rpart_model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
## n= 114
##
## CP nsplit rel error xerror xstd
## 1 0.4933333 0 1.00000000 1.14666667 0.06127941
## 2 0.4533333 1 0.50666667 0.77333333 0.07116957
## 3 0.0100000 2 0.05333333 0.09333333 0.03417647
##
## Variable importance
## Petal.Length Petal.Width Sepal.Length Sepal.Width
## 33 30 23 14
##
## Node number 1: 114 observations, complexity param=0.4933333
## predicted class=2 expected loss=0.6578947 P(node) =1
## class counts: 37 39 38
## probabilities: 0.325 0.342 0.333
## left son=2 (37 obs) right son=3 (77 obs)
## Primary splits:
## Petal.Length < 2.6 to the left, improve=37.48895, (0 missing)
## Petal.Width < 0.8 to the left, improve=37.48895, (0 missing)
## Sepal.Length < 5.45 to the left, improve=24.13058, (0 missing)
## Sepal.Width < 3.35 to the right, improve=15.91103, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.000, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.912, adj=0.730, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.851, adj=0.541, (0 split)
##
## Node number 2: 37 observations
## predicted class=1 expected loss=0 P(node) =0.3245614
## class counts: 37 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 77 observations, complexity param=0.4533333
## predicted class=2 expected loss=0.4935065 P(node) =0.6754386
## class counts: 0 39 38
## probabilities: 0.000 0.506 0.494
## left son=6 (37 obs) right son=7 (40 obs)
## Primary splits:
## Petal.Length < 4.85 to the left, improve=30.997560, (0 missing)
## Petal.Width < 1.75 to the left, improve=29.312550, (0 missing)
## Sepal.Length < 6.25 to the left, improve=11.055410, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 2.980435, (0 missing)
## Surrogate splits:
## Petal.Width < 1.65 to the left, agree=0.909, adj=0.811, (0 split)
## Sepal.Length < 6.25 to the left, agree=0.818, adj=0.622, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.649, adj=0.270, (0 split)
##
## Node number 6: 37 observations
## predicted class=2 expected loss=0.02702703 P(node) =0.3245614
## class counts: 0 36 1
## probabilities: 0.000 0.973 0.027
##
## Node number 7: 40 observations
## predicted class=3 expected loss=0.075 P(node) =0.3508772
## class counts: 0 3 37
## probabilities: 0.000 0.075 0.925
# Check how good the model is: predict the class of every test row,
# passing the test data without the response column
rpart_predict_iris <- predict(
  rpart_model_iris,
  test_iris[, -typeColNum_iris],
  type = "class"
)
# Share of test rows whose predicted class equals the true class
mn_iris <- mean(rpart_predict_iris == test_iris$Species)
mn_iris
# Confusion matrix: predicted classes in rows, true classes in columns
table(pred = rpart_predict_iris, true = test_iris$Species)
## true
## pred 1 2 3
## 1 13 0 0
## 2 0 10 2
## 3 0 1 10
# Display the complexity-parameter (CP) table of the fitted tree, which is
# the basis for the cost-complexity pruning below
printcp(x = rpart_model_iris)
##
## Classification tree:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
##
## Variables actually used in tree construction:
## [1] Petal.Length
##
## Root node error: 75/114 = 0.65789
##
## n= 114
##
## CP nsplit rel error xerror xstd
## 1 0.49333 0 1.000000 1.146667 0.061279
## 2 0.45333 1 0.506667 0.773333 0.071170
## 3 0.01000 2 0.053333 0.093333 0.034176
# Row of the CP table with the lowest cross-validated error (xerror)
opt_iris <- which.min(rpart_model_iris$cptable[, "xerror"])
# CP value stored in that row
cp_iris <- rpart_model_iris$cptable[opt_iris, "CP"]
cp_iris
# Prune the tree at the selected CP
pruned_model_iris <- prune(rpart_model_iris, cp = cp_iris)
# Plot the pruned tree
plot(pruned_model_iris)
text(pruned_model_iris)

# Pruning is not required: the selected CP belongs to the last row of the
# CP table, so the pruned tree keeps every split of the original tree
summary(pruned_model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
## n= 114
##
## CP nsplit rel error xerror xstd
## 1 0.4933333 0 1.00000000 1.14666667 0.06127941
## 2 0.4533333 1 0.50666667 0.77333333 0.07116957
## 3 0.0100000 2 0.05333333 0.09333333 0.03417647
##
## Variable importance
## Petal.Length Petal.Width Sepal.Length Sepal.Width
## 33 30 23 14
##
## Node number 1: 114 observations, complexity param=0.4933333
## predicted class=2 expected loss=0.6578947 P(node) =1
## class counts: 37 39 38
## probabilities: 0.325 0.342 0.333
## left son=2 (37 obs) right son=3 (77 obs)
## Primary splits:
## Petal.Length < 2.6 to the left, improve=37.48895, (0 missing)
## Petal.Width < 0.8 to the left, improve=37.48895, (0 missing)
## Sepal.Length < 5.45 to the left, improve=24.13058, (0 missing)
## Sepal.Width < 3.35 to the right, improve=15.91103, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.000, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.912, adj=0.730, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.851, adj=0.541, (0 split)
##
## Node number 2: 37 observations
## predicted class=1 expected loss=0 P(node) =0.3245614
## class counts: 37 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 77 observations, complexity param=0.4533333
## predicted class=2 expected loss=0.4935065 P(node) =0.6754386
## class counts: 0 39 38
## probabilities: 0.000 0.506 0.494
## left son=6 (37 obs) right son=7 (40 obs)
## Primary splits:
## Petal.Length < 4.85 to the left, improve=30.997560, (0 missing)
## Petal.Width < 1.75 to the left, improve=29.312550, (0 missing)
## Sepal.Length < 6.25 to the left, improve=11.055410, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 2.980435, (0 missing)
## Surrogate splits:
## Petal.Width < 1.65 to the left, agree=0.909, adj=0.811, (0 split)
## Sepal.Length < 6.25 to the left, agree=0.818, adj=0.622, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.649, adj=0.270, (0 split)
##
## Node number 6: 37 observations
## predicted class=2 expected loss=0.02702703 P(node) =0.3245614
## class counts: 0 36 1
## probabilities: 0.000 0.973 0.027
##
## Node number 7: 40 observations
## predicted class=3 expected loss=0.075 P(node) =0.3508772
## class counts: 0 3 37
## probabilities: 0.000 0.075 0.925
# Print the summary of the original (unpruned) tree once more
summary(object = rpart_model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
## n= 114
##
## CP nsplit rel error xerror xstd
## 1 0.4933333 0 1.00000000 1.14666667 0.06127941
## 2 0.4533333 1 0.50666667 0.77333333 0.07116957
## 3 0.0100000 2 0.05333333 0.09333333 0.03417647
##
## Variable importance
## Petal.Length Petal.Width Sepal.Length Sepal.Width
## 33 30 23 14
##
## Node number 1: 114 observations, complexity param=0.4933333
## predicted class=2 expected loss=0.6578947 P(node) =1
## class counts: 37 39 38
## probabilities: 0.325 0.342 0.333
## left son=2 (37 obs) right son=3 (77 obs)
## Primary splits:
## Petal.Length < 2.6 to the left, improve=37.48895, (0 missing)
## Petal.Width < 0.8 to the left, improve=37.48895, (0 missing)
## Sepal.Length < 5.45 to the left, improve=24.13058, (0 missing)
## Sepal.Width < 3.35 to the right, improve=15.91103, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.000, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.912, adj=0.730, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.851, adj=0.541, (0 split)
##
## Node number 2: 37 observations
## predicted class=1 expected loss=0 P(node) =0.3245614
## class counts: 37 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 77 observations, complexity param=0.4533333
## predicted class=2 expected loss=0.4935065 P(node) =0.6754386
## class counts: 0 39 38
## probabilities: 0.000 0.506 0.494
## left son=6 (37 obs) right son=7 (40 obs)
## Primary splits:
## Petal.Length < 4.85 to the left, improve=30.997560, (0 missing)
## Petal.Width < 1.75 to the left, improve=29.312550, (0 missing)
## Sepal.Length < 6.25 to the left, improve=11.055410, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 2.980435, (0 missing)
## Surrogate splits:
## Petal.Width < 1.65 to the left, agree=0.909, adj=0.811, (0 split)
## Sepal.Length < 6.25 to the left, agree=0.818, adj=0.622, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.649, adj=0.270, (0 split)
##
## Node number 6: 37 observations
## predicted class=2 expected loss=0.02702703 P(node) =0.3245614
## class counts: 0 36 1
## probabilities: 0.000 0.973 0.027
##
## Node number 7: 40 observations
## predicted class=3 expected loss=0.075 P(node) =0.3508772
## class counts: 0 3 37
## probabilities: 0.000 0.075 0.925