# Decision Tree in R

#install.packages("C50")
library(C50)
#If the data set is bundled with R, it can be loaded explicitly:
#data("iris")
#iris
# Summarise every column of iris: quartiles and mean for the four numeric
# measurements, and per-level counts for the Species factor.
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
#Splitting the data set into training and testing sets. The rows are ordered by species,
#so the data is first split by species.
# One data frame per species, selected with a logical membership mask.
iris_setosa <- iris[iris$Species %in% "setosa", ]
iris_versicolor <- iris[iris$Species %in% "versicolor", ]
iris_virginica <- iris[iris$Species %in% "virginica", ]

# Optional sequential split: within each species the first 35 rows go to the
# training set and rows 36-50 go to the test set.
species_frames <- list(iris_setosa, iris_versicolor, iris_virginica)

iris_train <- do.call(
  rbind,
  lapply(species_frames, function(d) d[seq_len(35), ])
)
iris_test <- do.call(
  rbind,
  lapply(species_frames, function(d) d[seq(36, 50), ])
)

#Splitting randomly
#Install the caret library, which is used to split the data set

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Random 70/30 train/test split on Species via caret's createDataPartition().
# The column is referenced as iris$Species rather than attach()-ing the data
# frame: attach() leaves every iris column on the search path, where it can
# mask or be masked by other objects, and nothing else here relies on it.
# A fixed seed makes the partition reproducible between runs.
# NOTE: the printed results recorded further down came from an unseeded run,
# so re-running may give slightly different predictions than those shown.
set.seed(123)
inTrainingData <- createDataPartition(y = iris$Species, p = 0.70, list = FALSE)

# Rows chosen by the partition form the training set; the rest are held out.
trainData <- iris[inTrainingData, ]
testData <- iris[-inTrainingData, ]

# Building the model on the training data set

# Fit a C5.0 decision tree: every column except the Species label is a
# predictor, and Species itself is the outcome.
dtModel <- C5.0(
  x = trainData[, setdiff(names(trainData), "Species")],
  y = trainData$Species
)

# Draw the fitted tree.
plot(dtModel)

# Predict the species of every row in the held-out test set using the
# tree fitted on the training data.
predict(dtModel, newdata = testData)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor virginica  versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica  virginica  virginica  versicolor virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica
# Summarise the test partition: value ranges of the measurements and the
# number of rows per species.
summary(testData)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width 
##  Min.   :4.400   Min.   :2.300   Min.   :1.000   Min.   :0.1  
##  1st Qu.:5.000   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.3  
##  Median :5.800   Median :3.000   Median :4.300   Median :1.3  
##  Mean   :5.767   Mean   :3.073   Mean   :3.729   Mean   :1.2  
##  3rd Qu.:6.300   3rd Qu.:3.400   3rd Qu.:5.100   3rd Qu.:1.8  
##  Max.   :7.700   Max.   :4.400   Max.   :6.700   Max.   :2.5  
##        Species  
##  setosa    :15  
##  versicolor:15  
##  virginica :15  
##                 
##                 
## 
# Compare the predictions with the true labels. The prediction is computed
# once and reused, and the accuracy is derived from the comparison itself
# rather than from counts typed in by hand (the former `43/45` only held for
# the one random split whose output is recorded here).
testPred <- predict(dtModel, testData)
isCorrect <- testData$Species == testPred

# Per-row correctness of the prediction.
isCorrect
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [34] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [45]  TRUE
# Accuracy = share of TRUE values.
mean(isCorrect)
## [1] 0.9555556
# The same accuracy written out as correct predictions / total test rows.
sum(isCorrect) / length(isCorrect)
## [1] 0.9555556
# Cross-tabulate actual species (rows) against predicted species (columns).
# The printed table appears to be labelled with the argument expressions as
# written, so the call is intentionally left in this exact form.
library(gmodels)
CrossTable(testData$Species,predict(dtModel, testData))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  45 
## 
##  
##                  | predict(dtModel, testData) 
## testData$Species |     setosa | versicolor |  virginica |  Row Total | 
## -----------------|------------|------------|------------|------------|
##           setosa |         15 |          0 |          0 |         15 | 
##                  |     20.000 |      5.000 |      5.000 |            | 
##                  |      1.000 |      0.000 |      0.000 |      0.333 | 
##                  |      1.000 |      0.000 |      0.000 |            | 
##                  |      0.333 |      0.000 |      0.000 |            | 
## -----------------|------------|------------|------------|------------|
##       versicolor |          0 |         14 |          1 |         15 | 
##                  |      5.000 |     16.200 |      3.200 |            | 
##                  |      0.000 |      0.933 |      0.067 |      0.333 | 
##                  |      0.000 |      0.933 |      0.067 |            | 
##                  |      0.000 |      0.311 |      0.022 |            | 
## -----------------|------------|------------|------------|------------|
##        virginica |          0 |          1 |         14 |         15 | 
##                  |      5.000 |      3.200 |     16.200 |            | 
##                  |      0.000 |      0.067 |      0.933 |      0.333 | 
##                  |      0.000 |      0.067 |      0.933 |            | 
##                  |      0.000 |      0.022 |      0.311 |            | 
## -----------------|------------|------------|------------|------------|
##     Column Total |         15 |         15 |         15 |         45 | 
##                  |      0.333 |      0.333 |      0.333 |            | 
## -----------------|------------|------------|------------|------------|
## 
## 
# One-way table of prediction correctness: counts and proportions of test
# rows whose predicted species matches (TRUE) or differs from (FALSE) the
# actual species.
CrossTable(testData$Species == predict(dtModel, testData))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  45 
## 
##  
##           |     FALSE |      TRUE | 
##           |-----------|-----------|
##           |         2 |        43 | 
##           |     0.044 |     0.956 | 
##           |-----------|-----------|
## 
## 
## 
##