# Data mining: decision-tree classification of the iris data (C5.0 and tree)
# install.packages("C50") # the C50 package must be installed to use C5.0()
#install.packages("tree")
library(C50)
## Warning: package 'C50' was built under R version 3.5.1
# Make the built-in iris data set explicitly available. (A bare data() call
# only lists the data sets that can be loaded; it does not load anything.)
data(iris)

# Split the data into training and testing sets. The rows of iris are ordered
# by species, so each species is split separately to keep both sets balanced.
# Number of rows per species that go into the training set; the rest of each
# species goes into the test set.
n_train_per_species <- 25L

iris_setosa <- iris[iris$Species == "setosa", ] # 50 rows
iris_versicolor <- iris[iris$Species == "versicolor", ] # 50 rows
iris_virginica <- iris[iris$Species == "virginica", ] # 50 rows

# head(x, n) takes the first n rows and tail(x, -n) takes everything after
# them, so the two sets never overlap and no NA rows are created when a group
# holds fewer rows than expected (which fixed 1:25 / 26:50 indexes would do).
iris_train <- rbind(
  head(iris_setosa, n_train_per_species),
  head(iris_versicolor, n_train_per_species),
  head(iris_virginica, n_train_per_species)
)
iris_test <- rbind(
  tail(iris_setosa, -n_train_per_species),
  tail(iris_versicolor, -n_train_per_species),
  tail(iris_virginica, -n_train_per_species)
)
# Fit a C5.0 decision tree on the training set: every column except the fifth
# (Species) is a predictor, and Species is the class label.
irisc5.0_train <- C5.0(x = iris_train[, -5], y = iris_train$Species)
plot(irisc5.0_train) # draw the fitted tree

# Training accuracy: share of training rows whose predicted species matches
# the actual species.
mean(predict(irisc5.0_train, newdata = iris_train) == iris_train$Species) # 97.33% accuracy
## [1] 0.9733333
# Test accuracy: predict the held-out rows and compare with the true labels.
predc5.0_test <- predict(irisc5.0_train, newdata = iris_test)
mean(iris_test$Species == predc5.0_test) # 94.67% accuracy
## [1] 0.9466667
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
# Cross-tabulation of actual (rows) against predicted (columns) species
CrossTable(x = iris_test$Species, y = predc5.0_test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 75
##
##
## | predc5.0_test
## iris_test$Species | setosa | versicolor | virginica | Row Total |
## ------------------|------------|------------|------------|------------|
## setosa | 25 | 0 | 0 | 25 |
## | 33.333 | 9.000 | 7.667 | |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## ------------------|------------|------------|------------|------------|
## versicolor | 0 | 24 | 1 | 25 |
## | 8.333 | 25.000 | 5.797 | |
## | 0.000 | 0.960 | 0.040 | 0.333 |
## | 0.000 | 0.889 | 0.043 | |
## | 0.000 | 0.320 | 0.013 | |
## ------------------|------------|------------|------------|------------|
## virginica | 0 | 3 | 22 | 25 |
## | 8.333 | 4.000 | 26.797 | |
## | 0.000 | 0.120 | 0.880 | 0.333 |
## | 0.000 | 0.111 | 0.957 | |
## | 0.000 | 0.040 | 0.293 | |
## ------------------|------------|------------|------------|------------|
## Column Total | 25 | 27 | 23 | 75 |
## | 0.333 | 0.360 | 0.307 | |
## ------------------|------------|------------|------------|------------|
##
##
##### Using tree function
library(tree)
## Warning: package 'tree' was built under R version 3.5.1
# Building a model on training data: a classification tree with Species as
# the response and every other column of iris_train as a predictor.
iris_tree <- tree(Species ~ ., data = iris_train)
plot(iris_tree)
text(iris_tree, pretty = 0)

# Predicting the test data using the model. The prediction is converted to a
# data frame with one row per test observation and one column per species
# (setosa, versicolor, virginica) holding that class's predicted probability.
pred_tree <- as.data.frame(predict(iris_tree, newdata = iris_test))
# Label every row with the species whose probability is highest. This is a
# single vectorised step instead of a row-by-row loop, it works for zero rows,
# and it does not fall back to "virginica" when no class exceeds 0.5.
# ties.method = "first" keeps the result deterministic on exact ties.
pred_tree$final <- colnames(pred_tree)[
  max.col(as.matrix(pred_tree), ties.method = "first")
]
mean(pred_tree$final == iris_test$Species) # Accuracy = 94.67% in the recorded run
## [1] 0.9466667
# Cross-tabulation of actual (rows) against predicted (columns) species
CrossTable(iris_test$Species, pred_tree$final)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 75
##
##
## | pred_tree$final
## iris_test$Species | setosa | versicolor | virginica | Row Total |
## ------------------|------------|------------|------------|------------|
## setosa | 25 | 0 | 0 | 25 |
## | 33.333 | 9.000 | 7.667 | |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## ------------------|------------|------------|------------|------------|
## versicolor | 0 | 24 | 1 | 25 |
## | 8.333 | 25.000 | 5.797 | |
## | 0.000 | 0.960 | 0.040 | 0.333 |
## | 0.000 | 0.889 | 0.043 | |
## | 0.000 | 0.320 | 0.013 | |
## ------------------|------------|------------|------------|------------|
## virginica | 0 | 3 | 22 | 25 |
## | 8.333 | 4.000 | 26.797 | |
## | 0.000 | 0.120 | 0.880 | 0.333 |
## | 0.000 | 0.111 | 0.957 | |
## | 0.000 | 0.040 | 0.293 | |
## ------------------|------------|------------|------------|------------|
## Column Total | 25 | 27 | 23 | 75 |
## | 0.333 | 0.360 | 0.307 | |
## ------------------|------------|------------|------------|------------|
##
##