# Decision Tree
#install.packages("RWeka")
library("RWeka")
data <- read.arff("C:/Users/DINESHKUMAR/OneDrive/Desktop/php7gmqTJ.arff")
View(data)
str(data)
## 'data.frame': 100 obs. of 6 variables:
## $ V1 : Factor w/ 3 levels "high","low","medium": 1 1 3 1 3 3 1 1 1 3 ...
## $ V2 : Factor w/ 3 levels "left","middle",..: 1 1 2 1 2 2 1 3 3 3 ...
## $ V3 : Factor w/ 5 levels "impression","news",..: 1 3 5 3 2 2 3 3 3 5 ...
## $ V4 : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ V5 : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 1 1 1 ...
## $ Class: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 1 2 ...
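# The data set: 100 observations of five categorical predictors (V1-V5) and a
# binary target Class.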
# Creating Train and Test Set
library(caTools)
##
## Attaching package: 'caTools'
## The following object is masked from 'package:RWeka':
##
## LogitBoost
set.seed(123)
# Stratified split on the Class label: 2/3 train, 1/3 test
split <- sample.split(Y = data$Class, SplitRatio = 2/3)
train_set <- subset(x = data, split == TRUE)
test_set <- subset(x = data, split == FALSE)
dim(train_set)
## [1] 66 6
dim(test_set)
## [1] 34 6
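# sample.split preserves the relative class frequencies of Y, so both subsets
# should keep roughly the original class balance (about one third class 1);
# an optional sanity check:
prop.table(table(train_set$Class))
prop.table(table(test_set$Class))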
# Building a Model
library(rpart)
fit <- rpart(formula = Class ~ ., data = train_set, method = "class")
summary(fit)
## Call:
## rpart(formula = Class ~ ., data = train_set, method = "class")
## n= 66
##
## CP nsplit rel error xerror xstd
## 1 0.19047619 0 1.0000000 1.000000 0.1801875
## 2 0.01190476 1 0.8095238 1.238095 0.1890277
## 3 0.01000000 5 0.7619048 1.238095 0.1890277
##
## Variable importance
## V1 V3 V2 V5 V4
## 33 26 26 9 6
##
## Node number 1: 66 observations, complexity param=0.1904762
## predicted class=2 expected loss=0.3181818 P(node) =1
## class counts: 21 45
## probabilities: 0.318 0.682
## left son=2 (8 obs) right son=3 (58 obs)
## Primary splits:
## V1 splits as RLR, improve=3.39498400, (0 missing)
## V3 splits as RRRLL, improve=2.31161600, (0 missing)
## V2 splits as RRL, improve=1.89723300, (0 missing)
## V4 splits as RL, improve=0.67946710, (0 missing)
## V5 splits as RL, improve=0.06493506, (0 missing)
##
## Node number 2: 8 observations
## predicted class=1 expected loss=0.25 P(node) =0.1212121
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 3: 58 observations, complexity param=0.01190476
## predicted class=2 expected loss=0.2586207 P(node) =0.8787879
## class counts: 15 43
## probabilities: 0.259 0.741
## left son=6 (16 obs) right son=7 (42 obs)
## Primary splits:
## V2 splits as RRL, improve=2.57471300, (0 missing)
## V5 splits as RL, improve=1.49099200, (0 missing)
## V3 splits as RRRLL, improve=1.41399800, (0 missing)
## V4 splits as RL, improve=0.21336810, (0 missing)
## V1 splits as R-L, improve=0.03047022, (0 missing)
## Surrogate splits:
## V3 splits as RRRRL, agree=0.776, adj=0.187, (0 split)
##
## Node number 6: 16 observations
## predicted class=1 expected loss=0.5 P(node) =0.2424242
## class counts: 8 8
## probabilities: 0.500 0.500
##
## Node number 7: 42 observations, complexity param=0.01190476
## predicted class=2 expected loss=0.1666667 P(node) =0.6363636
## class counts: 7 35
## probabilities: 0.167 0.833
## left son=14 (29 obs) right son=15 (13 obs)
## Primary splits:
## V3 splits as LRLLR, improve=1.04597700, (0 missing)
## V5 splits as RL, improve=0.82795700, (0 missing)
## V1 splits as L-R, improve=0.19444440, (0 missing)
## V2 splits as LR-, improve=0.00952381, (0 missing)
## V4 splits as RL, improve=0.00952381, (0 missing)
## Surrogate splits:
## V2 splits as LR-, agree=0.714, adj=0.077, (0 split)
##
## Node number 14: 29 observations, complexity param=0.01190476
## predicted class=2 expected loss=0.2413793 P(node) =0.4393939
## class counts: 7 22
## probabilities: 0.241 0.759
## left son=28 (22 obs) right son=29 (7 obs)
## Primary splits:
## V5 splits as RL, improve=1.075235000, (0 missing)
## V3 splits as R-LL-, improve=0.125740200, (0 missing)
## V1 splits as R-L, improve=0.003042596, (0 missing)
## Surrogate splits:
## V4 splits as RL, agree=0.862, adj=0.429, (0 split)
## V2 splits as LR-, agree=0.793, adj=0.143, (0 split)
## V3 splits as R-LL-, agree=0.793, adj=0.143, (0 split)
##
## Node number 15: 13 observations
## predicted class=2 expected loss=0 P(node) =0.1969697
## class counts: 0 13
## probabilities: 0.000 1.000
##
## Node number 28: 22 observations, complexity param=0.01190476
## predicted class=2 expected loss=0.3181818 P(node) =0.3333333
## class counts: 7 15
## probabilities: 0.318 0.682
## left son=56 (7 obs) right son=57 (15 obs)
## Primary splits:
## V3 splits as L-RL-, improve=1.31688300, (0 missing)
## V1 splits as R-L, improve=0.08116883, (0 missing)
## Surrogate splits:
## V1 splits as R-L, agree=0.773, adj=0.286, (0 split)
## V2 splits as RL-, agree=0.727, adj=0.143, (0 split)
## V4 splits as LR, agree=0.727, adj=0.143, (0 split)
##
## Node number 29: 7 observations
## predicted class=2 expected loss=0 P(node) =0.1060606
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 56: 7 observations
## predicted class=1 expected loss=0.4285714 P(node) =0.1060606
## class counts: 4 3
## probabilities: 0.571 0.429
##
## Node number 57: 15 observations
## predicted class=2 expected loss=0.2 P(node) =0.2272727
## class counts: 3 12
## probabilities: 0.200 0.800
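# The CP table above shows xerror is lowest for the unsplit tree and rises
# after splitting, a hint that the full tree may be overfitting these 66 rows.
# A minimal pruning sketch: cp = 0.05 is an illustrative threshold chosen
# between the two CP values printed above (0.190 and 0.012), so only the first
# split on V1 survives; the pruned tree can be plotted the same way as fit.
fit_pruned <- prune(fit, cp = 0.05)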
library(rpart.plot)
rpart.plot(fit)
# (figure: decision tree drawn by rpart.plot)
# Predict class labels for the held-out test set
predict_unseen <- predict(object = fit, newdata = test_set, type = "class")
predict_unseen
## 2 4 5 8 9 12 14 19 23 26 27 28 31 35 43 45 46 47 49 50
## 2 2 2 1 1 2 1 1 1 2 1 1 1 2 2 2 2 1 1 1
## 53 64 69 70 73 75 81 82 84 88 93 94 96 100
## 2 1 2 1 1 2 1 1 1 1 2 1 1 1
## Levels: 1 2
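# type = "class" returns hard labels; type = "prob" would return the class
# probabilities instead, e.g.:
head(predict(object = fit, newdata = test_set, type = "prob"))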
# Confusion Matrix
# Rows are the actual classes, columns the predicted classes
cmat <- table(test_set$Class, predict_unseen)
cmat
##    predict_unseen
##      1  2
##   1 11  0
##   2 10 13
# Accuracy
sum(diag(cmat))/sum(cmat)
## [1] 0.7058824
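# Accuracy alone hides the asymmetry visible in the matrix: all 11 class-1
# cases are recovered, but 10 of the 23 class-2 cases are misclassified.
# Per-class recall from cmat (rows = actual):
diag(cmat) / rowSums(cmat)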
plot(fit)
text(fit)
# (figure: base-graphics decision tree from plot/text)
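# Optionally, the base-graphics tree can be redrawn with evenly spaced levels
# and leaf counts (use.n = TRUE adds the class counts at each node):
plot(fit, uniform = TRUE, margin = 0.1)
text(fit, use.n = TRUE, cex = 0.8)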