# Decision Tree in R
#install.packages("C50")
library(C50)
# iris is used below without an explicit load; uncomment the next two lines
# to load it with data() and print the whole data set.
#data("iris")
#iris
# Per-column summary of the iris data set.
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Split the data set into training and testing parts. The rows of iris are
# ordered by species, so each species is first pulled into its own data frame.
iris_setosa <- iris[iris$Species %in% "setosa", ]
iris_versicolor <- iris[iris$Species %in% "versicolor", ]
iris_virginica <- iris[iris$Species %in% "virginica", ]
# Sequential split (optional): the first 35 rows of every species go to the
# training set and the remaining rows 36 to 50 go to the test set.
iris_train <- rbind(
  iris_setosa[seq_len(35), ],
  iris_versicolor[seq_len(35), ],
  iris_virginica[seq_len(35), ]
)
iris_test <- rbind(
  iris_setosa[36:50, ],
  iris_versicolor[36:50, ],
  iris_virginica[36:50, ]
)
# Random split.
# The caret package provides createDataPartition(), which is used to split
# the data set.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Fix the RNG seed so the random partition, and every figure derived from it,
# is the same on each run. The outputs recorded in this file came from an
# unseeded run, so their exact values may differ.
set.seed(123)
# 70/30 partition on Species. The column is referenced through iris$Species
# rather than attach(iris), which would leave every iris column on the search
# path where it can mask, or be masked by, other objects.
inTrainingData <- createDataPartition(
  y = iris$Species,
  p = 0.70,
  list = FALSE
)
trainData <- iris[inTrainingData, ]
testData <- iris[-inTrainingData, ]
# Building model on training data set
# Fit a C5.0 decision tree: every column except the fifth (Species) is a
# predictor and Species is the outcome.
dtModel <- C5.0(x = trainData[, -5], y = trainData$Species)
# Draw the fitted tree.
plot(x = dtModel)

# Check the model fitted on the training data by predicting the species of
# every row of the test data.
predict(object = dtModel, newdata = testData)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor virginica versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica virginica versicolor virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
# Per-column summary of the test set, including the number of rows per species.
summary(object = testData)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.400 Min. :2.300 Min. :1.000 Min. :0.1
## 1st Qu.:5.000 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.3
## Median :5.800 Median :3.000 Median :4.300 Median :1.3
## Mean :5.767 Mean :3.073 Mean :3.729 Mean :1.2
## 3rd Qu.:6.300 3rd Qu.:3.400 3rd Qu.:5.100 3rd Qu.:1.8
## Max. :7.700 Max. :4.400 Max. :6.700 Max. :2.5
## Species
## setosa :15
## versicolor:15
## virginica :15
##
##
##
# Element-wise check: TRUE where the predicted species equals the actual one.
predict(dtModel, testData) == testData$Species
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE
# Test-set accuracy: the mean of a logical vector is the share of TRUE values.
mean(x = predict(dtModel, testData) == testData$Species)
## [1] 0.9555556
# Manual cross-check of the accuracy for the recorded run: the logical vector
# printed above holds 2 FALSE values among 45, i.e. 43 correct predictions.
# These counts are specific to that run's random split.
43/45
## [1] 0.9555556
# Cross table of actual species (rows) against predicted species (columns).
# CrossTable() comes from the gmodels package.
library(gmodels)
CrossTable(x = testData$Species, y = predict(dtModel, testData))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45
##
##
## | predict(dtModel, testData)
## testData$Species | setosa | versicolor | virginica | Row Total |
## -----------------|------------|------------|------------|------------|
## setosa | 15 | 0 | 0 | 15 |
## | 20.000 | 5.000 | 5.000 | |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## -----------------|------------|------------|------------|------------|
## versicolor | 0 | 14 | 1 | 15 |
## | 5.000 | 16.200 | 3.200 | |
## | 0.000 | 0.933 | 0.067 | 0.333 |
## | 0.000 | 0.933 | 0.067 | |
## | 0.000 | 0.311 | 0.022 | |
## -----------------|------------|------------|------------|------------|
## virginica | 0 | 1 | 14 | 15 |
## | 5.000 | 3.200 | 16.200 | |
## | 0.000 | 0.067 | 0.933 | 0.333 |
## | 0.000 | 0.067 | 0.933 | |
## | 0.000 | 0.022 | 0.311 | |
## -----------------|------------|------------|------------|------------|
## Column Total | 15 | 15 | 15 | 45 |
## | 0.333 | 0.333 | 0.333 | |
## -----------------|------------|------------|------------|------------|
##
##
# One-way table counting wrong (FALSE) and right (TRUE) predictions.
CrossTable(x = testData$Species == predict(dtModel, testData))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45
##
##
## | FALSE | TRUE |
## |-----------|-----------|
## | 2 | 43 |
## | 0.044 | 0.956 |
## |-----------|-----------|
##
##
##
##