Importing Libraries
library(rpart)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
Loading Data into the dataframe from CSV
# Read the golf data set from CSV. Empty strings and the literal "NA" are both
# treated as missing values.
dataframe <- read.csv(
  "~/AI/DataSet/DecisionTreeDataSet.csv",
  header = TRUE,
  na.strings = c("", "NA")
)
# List the column names that were read in.
names(dataframe)
## [1] "Temperature" "Outlook" "Humidity" "Windy" "Golf"
Separating Test and Train Data
# Test set: the rows whose Golf value is missing.
test <- dataframe[is.na(dataframe[["Golf"]]), ]
# Training set: only the rows with no missing value in any column.
train <- na.omit(dataframe)
Visualizing train data
# Print the training rows.
train
Visualizing test data where golf is to be predicted by decision tree later
# Print the test rows (the ones with a missing Golf value).
test
Initializing the rpart decision tree with the train data
parms = By default, rpart uses Gini impurity to select splits when performing classification. You can use information gain instead by specifying it in the parms parameter.
method = one of “anova”, “poisson”, “class” or “exp”. If method is missing then the routine tries to make an intelligent guess. If y is a survival object, then method = “exp” is assumed, if y has 2 columns then method = “poisson” is assumed, if y is a factor then method = “class” is assumed, otherwise method = “anova” is assumed. It is wisest to specify the method directly, especially as more criteria may be added to the function in future.
minsplit = number of observations that must exist in a node in order for a split to be attempted
minbucket = the minimum number of observations in any terminal node
# Fit a classification tree for Golf on the four predictors. Splits are chosen
# by information gain; minsplit = 2 and minbucket = 1 let the tree keep
# splitting down to single-observation leaves.
tree <- rpart(
  Golf ~ Temperature + Outlook + Humidity + Windy,
  data = train,
  method = "class",
  parms = list(split = "information"),
  minsplit = 2,
  minbucket = 1
)
Tree Summary
# Print the detailed summary of the fitted tree (CP table, variable
# importance, and per-node split information).
summary(tree)
## Call:
## rpart(formula = Golf ~ Temperature + Outlook + Humidity + Windy,
## data = train, method = "class", parms = list(split = "information"),
## minsplit = 2, minbucket = 1)
## n= 14
##
## CP nsplit rel error xerror xstd
## 1 0.30 0 1.0 1.0 0.3585686
## 2 0.10 2 0.4 1.6 0.3703280
## 3 0.01 6 0.0 1.4 0.3741657
##
## Variable importance
## Outlook Temperature Windy Humidity
## 35 24 23 18
##
## Node number 1: 14 observations, complexity param=0.3
## predicted class=Yes expected loss=0.3571429 P(node) =1
## class counts: 5 9
## probabilities: 0.357 0.643
## left son=2 (10 obs) right son=3 (4 obs)
## Primary splits:
## Outlook splits as RLL, improve=2.1931200, (0 missing)
## Humidity splits as LR, improve=1.4734210, (0 missing)
## Windy splits as RL, improve=0.4670276, (0 missing)
## Temperature splits as RLR, improve=0.2433601, (0 missing)
##
## Node number 2: 10 observations, complexity param=0.3
## predicted class=No expected loss=0.5 P(node) =0.7142857
## class counts: 5 5
## probabilities: 0.500 0.500
## left son=4 (5 obs) right son=5 (5 obs)
## Primary splits:
## Humidity splits as LR, improve=1.9274480, (0 missing)
## Temperature splits as RLR, improve=1.6389660, (0 missing)
## Windy splits as RL, improve=0.8630462, (0 missing)
## Outlook splits as -RL, improve=0.2013551, (0 missing)
## Surrogate splits:
## Temperature splits as RLL, agree=0.8, adj=0.6, (0 split)
## Outlook splits as -RL, agree=0.6, adj=0.2, (0 split)
##
## Node number 3: 4 observations
## predicted class=Yes expected loss=0 P(node) =0.2857143
## class counts: 0 4
## probabilities: 0.000 1.000
##
## Node number 4: 5 observations, complexity param=0.1
## predicted class=No expected loss=0.2 P(node) =0.3571429
## class counts: 4 1
## probabilities: 0.800 0.200
## left son=8 (3 obs) right son=9 (2 obs)
## Primary splits:
## Outlook splits as -RL, improve=1.1157180, (0 missing)
## Temperature splits as -LR, improve=0.5924696, (0 missing)
## Windy splits as RL, improve=0.5924696, (0 missing)
##
## Node number 5: 5 observations, complexity param=0.1
## predicted class=Yes expected loss=0.2 P(node) =0.3571429
## class counts: 1 4
## probabilities: 0.200 0.800
## left son=10 (2 obs) right son=11 (3 obs)
## Primary splits:
## Windy splits as RL, improve=1.1157180, (0 missing)
## Temperature splits as L-R, improve=0.5924696, (0 missing)
## Outlook splits as -LR, improve=0.5924696, (0 missing)
##
## Node number 8: 3 observations
## predicted class=No expected loss=0 P(node) =0.2142857
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 9: 2 observations, complexity param=0.1
## predicted class=No expected loss=0.5 P(node) =0.1428571
## class counts: 1 1
## probabilities: 0.500 0.500
## left son=18 (1 obs) right son=19 (1 obs)
## Primary splits:
## Windy splits as RL, improve=1.386294, (0 missing)
##
## Node number 10: 2 observations, complexity param=0.1
## predicted class=No expected loss=0.5 P(node) =0.1428571
## class counts: 1 1
## probabilities: 0.500 0.500
## left son=20 (1 obs) right son=21 (1 obs)
## Primary splits:
## Temperature splits as L-R, improve=1.386294, (0 missing)
## Outlook splits as -LR, improve=1.386294, (0 missing)
##
## Node number 11: 3 observations
## predicted class=Yes expected loss=0 P(node) =0.2142857
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 18: 1 observations
## predicted class=No expected loss=0 P(node) =0.07142857
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 19: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.07142857
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 20: 1 observations
## predicted class=No expected loss=0 P(node) =0.07142857
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 21: 1 observations
## predicted class=Yes expected loss=0 P(node) =0.07142857
## class counts: 0 1
## probabilities: 0.000 1.000
Plotting Decision Tree
# Draw the fitted tree with fancyRpartPlot, captioned "Decision Tree".
fancyRpartPlot(tree, caption = "Decision Tree")
Viewing the variable importance
# Show the variable-importance scores stored on the fitted tree.
tree[["variable.importance"]]
## Outlook Temperature Windy Humidity
## 3.694327 2.542763 2.502012 1.927448
Predicting the two missing Golf values and adding them back to the test table
# Predict a class label for each row of the test set.
tree.pred <- predict(tree, newdata = test, type = "class")
# Store the predictions in the Golf column, then print the completed rows.
test$Golf <- tree.pred
test