Decision Tree

Importing Libraries

library(rpart)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)

Loading Data into the dataframe from CSV

# Read the golf data set from CSV. Empty strings and the literal "NA" are both
# treated as missing values.
dataframe <- read.csv(
  "~/AI/DataSet/DecisionTreeDataSet.csv",
  header = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  na.strings = c("", "NA")
)
# List the column names to confirm what was loaded.
names(dataframe)
## [1] "Temperature" "Outlook"     "Humidity"    "Windy"       "Golf"

Separating Test and Train Data

# Rows with no Golf label form the test set.
test <- dataframe[is.na(dataframe[["Golf"]]), ]
# Only rows with no missing value in any column are kept for training.
train <- na.omit(dataframe)

Visualizing train data

# Auto-print the training rows (the complete cases kept by na.omit).
train

Visualizing the test data, where Golf is to be predicted by the decision tree later

# Auto-print the rows whose Golf value is missing.
test

Initializing the rpart decision tree with the training data

parms = By default, rpart uses Gini impurity to select splits when performing classification. You can use information gain instead by specifying it in the parms parameter.

method = one of “anova”, “poisson”, “class” or “exp”. If method is missing then the routine tries to make an intelligent guess. If y is a survival object, then method = “exp” is assumed, if y has 2 columns then method = “poisson” is assumed, if y is a factor then method = “class” is assumed, otherwise method = “anova” is assumed. It is wisest to specify the method directly, especially as more criteria may be added to the function in future.

minsplit = number of observations that must exist in a node in order for a split to be attempted

minbucket = the minimum number of observations in any terminal node

# Fit a classification tree for Golf on the four predictors.
# Splits are chosen by information gain rather than the default Gini impurity.
# minsplit = 2 and minbucket = 1 let the tree keep splitting down to
# single-observation leaves.
tree <- rpart(
  formula = Golf ~ Temperature + Outlook + Humidity + Windy,
  data = train,
  method = "class",
  parms = list(split = "information"),
  minsplit = 2,
  minbucket = 1
)

Tree Summary

# Print the fitted model's details: the call, the complexity-parameter (CP)
# table, variable importance, and the splits considered at each node.
summary(tree)
## Call:
## rpart(formula = Golf ~ Temperature + Outlook + Humidity + Windy, 
##     data = train, method = "class", parms = list(split = "information"), 
##     minsplit = 2, minbucket = 1)
##   n= 14 
## 
##     CP nsplit rel error xerror      xstd
## 1 0.30      0       1.0    1.0 0.3585686
## 2 0.10      2       0.4    1.6 0.3703280
## 3 0.01      6       0.0    1.4 0.3741657
## 
## Variable importance
##     Outlook Temperature       Windy    Humidity 
##          35          24          23          18 
## 
## Node number 1: 14 observations,    complexity param=0.3
##   predicted class=Yes  expected loss=0.3571429  P(node) =1
##     class counts:     5     9
##    probabilities: 0.357 0.643 
##   left son=2 (10 obs) right son=3 (4 obs)
##   Primary splits:
##       Outlook     splits as  RLL, improve=2.1931200, (0 missing)
##       Humidity    splits as  LR,  improve=1.4734210, (0 missing)
##       Windy       splits as  RL,  improve=0.4670276, (0 missing)
##       Temperature splits as  RLR, improve=0.2433601, (0 missing)
## 
## Node number 2: 10 observations,    complexity param=0.3
##   predicted class=No   expected loss=0.5  P(node) =0.7142857
##     class counts:     5     5
##    probabilities: 0.500 0.500 
##   left son=4 (5 obs) right son=5 (5 obs)
##   Primary splits:
##       Humidity    splits as  LR,  improve=1.9274480, (0 missing)
##       Temperature splits as  RLR, improve=1.6389660, (0 missing)
##       Windy       splits as  RL,  improve=0.8630462, (0 missing)
##       Outlook     splits as  -RL, improve=0.2013551, (0 missing)
##   Surrogate splits:
##       Temperature splits as  RLL, agree=0.8, adj=0.6, (0 split)
##       Outlook     splits as  -RL, agree=0.6, adj=0.2, (0 split)
## 
## Node number 3: 4 observations
##   predicted class=Yes  expected loss=0  P(node) =0.2857143
##     class counts:     0     4
##    probabilities: 0.000 1.000 
## 
## Node number 4: 5 observations,    complexity param=0.1
##   predicted class=No   expected loss=0.2  P(node) =0.3571429
##     class counts:     4     1
##    probabilities: 0.800 0.200 
##   left son=8 (3 obs) right son=9 (2 obs)
##   Primary splits:
##       Outlook     splits as  -RL, improve=1.1157180, (0 missing)
##       Temperature splits as  -LR, improve=0.5924696, (0 missing)
##       Windy       splits as  RL,  improve=0.5924696, (0 missing)
## 
## Node number 5: 5 observations,    complexity param=0.1
##   predicted class=Yes  expected loss=0.2  P(node) =0.3571429
##     class counts:     1     4
##    probabilities: 0.200 0.800 
##   left son=10 (2 obs) right son=11 (3 obs)
##   Primary splits:
##       Windy       splits as  RL,  improve=1.1157180, (0 missing)
##       Temperature splits as  L-R, improve=0.5924696, (0 missing)
##       Outlook     splits as  -LR, improve=0.5924696, (0 missing)
## 
## Node number 8: 3 observations
##   predicted class=No   expected loss=0  P(node) =0.2142857
##     class counts:     3     0
##    probabilities: 1.000 0.000 
## 
## Node number 9: 2 observations,    complexity param=0.1
##   predicted class=No   expected loss=0.5  P(node) =0.1428571
##     class counts:     1     1
##    probabilities: 0.500 0.500 
##   left son=18 (1 obs) right son=19 (1 obs)
##   Primary splits:
##       Windy splits as  RL, improve=1.386294, (0 missing)
## 
## Node number 10: 2 observations,    complexity param=0.1
##   predicted class=No   expected loss=0.5  P(node) =0.1428571
##     class counts:     1     1
##    probabilities: 0.500 0.500 
##   left son=20 (1 obs) right son=21 (1 obs)
##   Primary splits:
##       Temperature splits as  L-R, improve=1.386294, (0 missing)
##       Outlook     splits as  -LR, improve=1.386294, (0 missing)
## 
## Node number 11: 3 observations
##   predicted class=Yes  expected loss=0  P(node) =0.2142857
##     class counts:     0     3
##    probabilities: 0.000 1.000 
## 
## Node number 18: 1 observations
##   predicted class=No   expected loss=0  P(node) =0.07142857
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 19: 1 observations
##   predicted class=Yes  expected loss=0  P(node) =0.07142857
##     class counts:     0     1
##    probabilities: 0.000 1.000 
## 
## Node number 20: 1 observations
##   predicted class=No   expected loss=0  P(node) =0.07142857
##     class counts:     1     0
##    probabilities: 1.000 0.000 
## 
## Node number 21: 1 observations
##   predicted class=Yes  expected loss=0  P(node) =0.07142857
##     class counts:     0     1
##    probabilities: 0.000 1.000

Plotting Decision Tree

# Draw the fitted tree with rattle's fancyRpartPlot, captioned "Decision Tree".
rattle::fancyRpartPlot(tree, caption = "Decision Tree")

Viewing the variable importance

# Show the importance score the fitted tree stores for each predictor.
tree[["variable.importance"]]
##     Outlook Temperature       Windy    Humidity 
##    3.694327    2.542763    2.502012    1.927448

Predicting the two missing Golf values and adding them back to the test table

# Predict a class label for every row of the test set.
tree.pred <- predict(tree, newdata = test, type = "class")

# Fill the Golf column with the predictions and show the result.
test$Golf <- tree.pred
test