# library for formatting tables (e.g. LaTeX/HTML output)
library(xtable)
# library for Classification and Regression Trees (CART)
library(rpart)
#The iris data set gives the measurements, in centimeters, of
#sepal length and width and petal length and width for
#50 flowers from each of 3 species of iris
data("iris")
# Copy the iris dataset into the variable ir
ir<-iris
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#Species is already a factor; convert it to numeric class codes
ir$Species<-as.numeric(ir$Species)
str(ir)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
#There are three levels in the class variable: 1 = setosa,
#2 = versicolor, 3 = virginica
v<-ir$Species
table(v)
## v
##  1  2  3 
## 50 50 50
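#The numeric codes correspond to the original factor levels;
#a quick check, purely for reference:
data.frame(code = 1:3, species = levels(iris$Species))
##   code    species
## 1    1     setosa
## 2    2 versicolor
## 3    3  virginica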
#set seed to ensure reproducible results
set.seed(250)
#splitting into training and test sets in a roughly 3:1 ratio
ir[,'train'] <- ifelse(runif(nrow(ir))<0.75,1,0)
#separate training and test sets
train_iris <- ir[ir$train==1,]
test_iris <- ir[ir$train==0,]
#get column index of train flag
iris_trainColNum <- grep('train',names(train_iris))
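#Note: the runif() approach above gives only an approximate 3:1 split.
#An exact 75/25 split could be done with sample() instead (a sketch;
#train_idx, train_alt and test_alt are illustrative names, not used below)
train_idx <- sample(seq_len(nrow(ir)), size = floor(0.75 * nrow(ir)))
train_alt <- ir[train_idx, ]
test_alt  <- ir[-train_idx, ]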
str(test_iris)
## 'data.frame':    36 obs. of  6 variables:
##  $ Sepal.Length: num  4.9 4.6 5 5.4 4.6 4.9 5.1 5 5.2 4.8 ...
##  $ Sepal.Width : num  3 3.1 3.6 3.9 3.4 3.1 3.7 3 3.5 3.1 ...
##  $ Petal.Length: num  1.4 1.5 1.4 1.7 1.4 1.5 1.5 1.6 1.5 1.6 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.4 0.3 0.1 0.4 0.2 0.2 0.2 ...
##  $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ train       : num  0 0 0 0 0 0 0 0 0 0 ...
#remove the train flag column to obtain the final train and test sets
train_iris <- train_iris[,-iris_trainColNum]
test_iris <- test_iris[,-iris_trainColNum]
#Get column index of the response variable (Species)
typeColNum_iris <- grep('Species',names(ir))
#Fit the decision tree model
rpart_model_iris <- rpart(Species~.,data = train_iris, method= 'class')
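#The fit above uses rpart's defaults; they can be tuned through
#rpart.control, e.g. minsplit (minimum observations needed to attempt
#a split) and cp (complexity threshold). A sketch with the defaults
#made explicit (rpart_model_default is an illustrative name, not used below):
rpart_model_default <- rpart(Species~., data = train_iris, method = 'class',
                             control = rpart.control(minsplit = 20, cp = 0.01))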
# Plotting the tree
plot(rpart_model_iris)
text(rpart_model_iris)
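#A more readable tree plot is available from the rpart.plot package
#(a sketch; assumes the package is installed, hence the guard):
if (requireNamespace("rpart.plot", quietly = TRUE)) {
  rpart.plot::rpart.plot(rpart_model_iris)
}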

summary(rpart_model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
##   n= 114 
## 
##          CP nsplit  rel error     xerror       xstd
## 1 0.4933333      0 1.00000000 1.14666667 0.06127941
## 2 0.4533333      1 0.50666667 0.77333333 0.07116957
## 3 0.0100000      2 0.05333333 0.09333333 0.03417647
## 
## Variable importance
## Petal.Length  Petal.Width Sepal.Length  Sepal.Width 
##           33           30           23           14 
## 
## Node number 1: 114 observations,    complexity param=0.4933333
##   predicted class=2  expected loss=0.6578947  P(node) =1
##     class counts:    37    39    38
##    probabilities: 0.325 0.342 0.333 
##   left son=2 (37 obs) right son=3 (77 obs)
##   Primary splits:
##       Petal.Length < 2.6  to the left,  improve=37.48895, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=37.48895, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=24.13058, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=15.91103, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.000, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.912, adj=0.730, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.851, adj=0.541, (0 split)
## 
## Node number 2: 37 observations
##   predicted class=1  expected loss=0  P(node) =0.3245614
##     class counts:    37     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 77 observations,    complexity param=0.4533333
##   predicted class=2  expected loss=0.4935065  P(node) =0.6754386
##     class counts:     0    39    38
##    probabilities: 0.000 0.506 0.494 
##   left son=6 (37 obs) right son=7 (40 obs)
##   Primary splits:
##       Petal.Length < 4.85 to the left,  improve=30.997560, (0 missing)
##       Petal.Width  < 1.75 to the left,  improve=29.312550, (0 missing)
##       Sepal.Length < 6.25 to the left,  improve=11.055410, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 2.980435, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 1.65 to the left,  agree=0.909, adj=0.811, (0 split)
##       Sepal.Length < 6.25 to the left,  agree=0.818, adj=0.622, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.649, adj=0.270, (0 split)
## 
## Node number 6: 37 observations
##   predicted class=2  expected loss=0.02702703  P(node) =0.3245614
##     class counts:     0    36     1
##    probabilities: 0.000 0.973 0.027 
## 
## Node number 7: 40 observations
##   predicted class=3  expected loss=0.075  P(node) =0.3508772
##     class counts:     0     3    37
##    probabilities: 0.000 0.075 0.925
#Check model accuracy on the test set
rpart_predict_iris<- predict(rpart_model_iris,test_iris[,-typeColNum_iris],type='class')
mn_iris <- mean(rpart_predict_iris==test_iris$Species)
mn_iris
## [1] 0.9166667
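#predict() can also return class probabilities rather than hard labels
#(rpart_prob_iris is an illustrative name, not used below):
rpart_prob_iris <- predict(rpart_model_iris, test_iris[,-typeColNum_iris], type = 'prob')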
# Confusion matrix to examine per-class performance
table(pred=rpart_predict_iris,true=test_iris$Species)
##     true
## pred  1  2  3
##    1 13  0  0
##    2  0 10  2
##    3  0  1 10
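#Per-class precision and recall can be read off the confusion matrix
#(a sketch; cm, precision and recall are illustrative names):
cm <- table(pred = rpart_predict_iris, true = test_iris$Species)
precision <- diag(cm) / rowSums(cm)   # correct predictions per predicted class
recall    <- diag(cm) / colSums(cm)   # correct predictions per true class
round(rbind(precision, recall), 3)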
# Applying cost-complexity pruning
printcp(rpart_model_iris)
## 
## Classification tree:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
## 
## Variables actually used in tree construction:
## [1] Petal.Length
## 
## Root node error: 75/114 = 0.65789
## 
## n= 114 
## 
##        CP nsplit rel error   xerror     xstd
## 1 0.49333      0  1.000000 1.146667 0.061279
## 2 0.45333      1  0.506667 0.773333 0.071170
## 3 0.01000      2  0.053333 0.093333 0.034176
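#plotcp() plots the cross-validated error against cp and tree size,
#which helps to pick a pruning threshold visually:
plotcp(rpart_model_iris)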
#Find the index of the CP value with the lowest cross-validated error (xerror)
opt_iris <- which.min(rpart_model_iris$cptable[,'xerror'])
#CP value at that index (here the smallest CP in the table, so no pruning occurs)
cp_iris <- rpart_model_iris$cptable[opt_iris, 'CP' ]
cp_iris
## [1] 0.01
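#An alternative to picking the minimum-xerror CP is the 1-SE rule:
#take the simplest tree whose xerror is within one standard error of
#the minimum (a sketch; cp_1se is an illustrative name, not used below):
xerr_min <- rpart_model_iris$cptable[opt_iris, 'xerror']
xstd_min <- rpart_model_iris$cptable[opt_iris, 'xstd']
cp_1se <- rpart_model_iris$cptable[
  which(rpart_model_iris$cptable[, 'xerror'] <= xerr_min + xstd_min)[1], 'CP']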
pruned_model_iris <- prune(rpart_model_iris,cp_iris)
#plot tree
plot(pruned_model_iris)
text(pruned_model_iris)  

# The optimal CP matches the full tree, so pruning leaves the model unchanged
summary(pruned_model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, method = "class")
##   n= 114 
## 
##          CP nsplit  rel error     xerror       xstd
## 1 0.4933333      0 1.00000000 1.14666667 0.06127941
## 2 0.4533333      1 0.50666667 0.77333333 0.07116957
## 3 0.0100000      2 0.05333333 0.09333333 0.03417647
## 
## Variable importance
## Petal.Length  Petal.Width Sepal.Length  Sepal.Width 
##           33           30           23           14 
## 
## Node number 1: 114 observations,    complexity param=0.4933333
##   predicted class=2  expected loss=0.6578947  P(node) =1
##     class counts:    37    39    38
##    probabilities: 0.325 0.342 0.333 
##   left son=2 (37 obs) right son=3 (77 obs)
##   Primary splits:
##       Petal.Length < 2.6  to the left,  improve=37.48895, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=37.48895, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=24.13058, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=15.91103, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.000, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.912, adj=0.730, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.851, adj=0.541, (0 split)
## 
## Node number 2: 37 observations
##   predicted class=1  expected loss=0  P(node) =0.3245614
##     class counts:    37     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 77 observations,    complexity param=0.4533333
##   predicted class=2  expected loss=0.4935065  P(node) =0.6754386
##     class counts:     0    39    38
##    probabilities: 0.000 0.506 0.494 
##   left son=6 (37 obs) right son=7 (40 obs)
##   Primary splits:
##       Petal.Length < 4.85 to the left,  improve=30.997560, (0 missing)
##       Petal.Width  < 1.75 to the left,  improve=29.312550, (0 missing)
##       Sepal.Length < 6.25 to the left,  improve=11.055410, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 2.980435, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 1.65 to the left,  agree=0.909, adj=0.811, (0 split)
##       Sepal.Length < 6.25 to the left,  agree=0.818, adj=0.622, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.649, adj=0.270, (0 split)
## 
## Node number 6: 37 observations
##   predicted class=2  expected loss=0.02702703  P(node) =0.3245614
##     class counts:     0    36     1
##    probabilities: 0.000 0.973 0.027 
## 
## Node number 7: 40 observations
##   predicted class=3  expected loss=0.075  P(node) =0.3508772
##     class counts:     0     3    37
##    probabilities: 0.000 0.075 0.925