# Data mining: decision-tree classification of the iris data set

#install.packages("C50") # the C50 package must be installed before library(C50) will work
#install.packages("tree")
library(C50)
## Warning: package 'C50' was built under R version 3.5.1
# Load the built-in iris data set explicitly. A bare `data()` call only lists
# the data sets that are available; it does not load any of them.
data(iris)

# Train/test split. The rows of iris are grouped by species, so the split is
# made per species: rows 1-25 of each species subset form the training set
# and rows 26-50 form the test set. Both sets therefore stay balanced across
# the three classes (25 rows per species each).
iris_setosa <- iris[iris$Species == "setosa", ]         # 50 rows
iris_versicolor <- iris[iris$Species == "versicolor", ] # 50 rows
iris_virginica <- iris[iris$Species == "virginica", ]   # 50 rows

iris_train <- rbind(
  iris_setosa[1:25, ],
  iris_versicolor[1:25, ],
  iris_virginica[1:25, ]
)
iris_test <- rbind(
  iris_setosa[26:50, ],
  iris_versicolor[26:50, ],
  iris_virginica[26:50, ]
)


# Fit a C5.0 decision tree on the training set. Column 5 (Species) is the
# response, so it is dropped from the predictor frame and passed separately.
irisc5.0_train <- C5.0(x = iris_train[, -5], y = iris_train$Species)
plot(irisc5.0_train) # draw the fitted tree

# Training accuracy: share of training rows whose predicted species matches
train_pred_c50 <- predict(irisc5.0_train, newdata = iris_train)
mean(train_pred_c50 == iris_train$Species) # about 97.33% accuracy
## [1] 0.9733333

# Test accuracy: same measure on the held-out rows
predc5.0_test <- predict(irisc5.0_train, newdata = iris_test)
mean(predc5.0_test == iris_test$Species) # about 94.67% accuracy
## [1] 0.9466667
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
# Cross table of actual species (rows) against predicted species (columns)
CrossTable(iris_test$Species, predc5.0_test)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  75 
## 
##  
##                   | predc5.0_test 
## iris_test$Species |     setosa | versicolor |  virginica |  Row Total | 
## ------------------|------------|------------|------------|------------|
##            setosa |         25 |          0 |          0 |         25 | 
##                   |     33.333 |      9.000 |      7.667 |            | 
##                   |      1.000 |      0.000 |      0.000 |      0.333 | 
##                   |      1.000 |      0.000 |      0.000 |            | 
##                   |      0.333 |      0.000 |      0.000 |            | 
## ------------------|------------|------------|------------|------------|
##        versicolor |          0 |         24 |          1 |         25 | 
##                   |      8.333 |     25.000 |      5.797 |            | 
##                   |      0.000 |      0.960 |      0.040 |      0.333 | 
##                   |      0.000 |      0.889 |      0.043 |            | 
##                   |      0.000 |      0.320 |      0.013 |            | 
## ------------------|------------|------------|------------|------------|
##         virginica |          0 |          3 |         22 |         25 | 
##                   |      8.333 |      4.000 |     26.797 |            | 
##                   |      0.000 |      0.120 |      0.880 |      0.333 | 
##                   |      0.000 |      0.111 |      0.957 |            | 
##                   |      0.000 |      0.040 |      0.293 |            | 
## ------------------|------------|------------|------------|------------|
##      Column Total |         25 |         27 |         23 |         75 | 
##                   |      0.333 |      0.360 |      0.307 |            | 
## ------------------|------------|------------|------------|------------|
## 
## 
##### Using tree function 
library(tree)
## Warning: package 'tree' was built under R version 3.5.1
# Fit a classification tree for Species using every other column of the
# training data as a predictor, then draw the tree and label it.
iris_tree <- tree(formula = Species ~ ., data = iris_train)
plot(iris_tree)
text(iris_tree, pretty = 0)

# Predict the test set with the tree model. The prediction is used as a table
# with one probability column per species ("setosa", "versicolor",
# "virginica"), one row per test observation.
class_prob <- as.matrix(predict(iris_tree, newdata = iris_test))
pred_tree <- as.data.frame(class_prob)

# Label every row with its most probable species (the first column wins on
# ties). This is vectorised, works for zero rows, and -- unlike a fixed 0.5
# threshold that falls back to "virginica" -- never assigns a class that is
# not the most likely one when no probability exceeds 0.5.
best_col <- max.col(class_prob, ties.method = "first")
pred_tree$final <- colnames(class_prob)[best_col]

mean(pred_tree$final == iris_test$Species) # Accuracy = 94.67%
## [1] 0.9466667
CrossTable(iris_test$Species, pred_tree$final)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  75 
## 
##  
##                   | pred_tree$final 
## iris_test$Species |     setosa | versicolor |  virginica |  Row Total | 
## ------------------|------------|------------|------------|------------|
##            setosa |         25 |          0 |          0 |         25 | 
##                   |     33.333 |      9.000 |      7.667 |            | 
##                   |      1.000 |      0.000 |      0.000 |      0.333 | 
##                   |      1.000 |      0.000 |      0.000 |            | 
##                   |      0.333 |      0.000 |      0.000 |            | 
## ------------------|------------|------------|------------|------------|
##        versicolor |          0 |         24 |          1 |         25 | 
##                   |      8.333 |     25.000 |      5.797 |            | 
##                   |      0.000 |      0.960 |      0.040 |      0.333 | 
##                   |      0.000 |      0.889 |      0.043 |            | 
##                   |      0.000 |      0.320 |      0.013 |            | 
## ------------------|------------|------------|------------|------------|
##         virginica |          0 |          3 |         22 |         25 | 
##                   |      8.333 |      4.000 |     26.797 |            | 
##                   |      0.000 |      0.120 |      0.880 |      0.333 | 
##                   |      0.000 |      0.111 |      0.957 |            | 
##                   |      0.000 |      0.040 |      0.293 |            | 
## ------------------|------------|------------|------------|------------|
##      Column Total |         25 |         27 |         23 |         75 | 
##                   |      0.333 |      0.360 |      0.307 |            | 
## ------------------|------------|------------|------------|------------|
## 
##