# Grid of class proportions p and the complementary proportion 1 - p
err = seq(0.0, 1.0, .01)
m = 1 - err
# Classification error: 1 - max(p, 1 - p)
err.fun = function(a) {
  b = 1 - a
  return(1 - max(a, b))
}
class.error = sapply(err, err.fun)
# Gini index: p(1 - p) summed over both classes, i.e. 2p(1 - p)
gini = (err * (1 - err)) + (m * (1 - m))
# Cross-entropy; at the endpoints p = 0 and p = 1, R evaluates
# 0 * log2(0) as NaN and those points are simply dropped from the plot
entropy = -((err * log2(err)) + (m * log2(m)))
plot(err, entropy, type = "l", xlab = "p", ylab = "impurity")
lines(err, gini, col = "blue")
lines(err, class.error, col = "red")
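For readability, the three curves can be labeled; a minimal sketch using base R's legend(), with colors matching the lines() calls above:

# Label the three impurity curves drawn above
legend("topright", legend = c("Entropy", "Gini index", "Classification error"),
       col = c("black", "blue", "red"), lty = 1)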
library(ISLR)
set.seed(42)
# 80/20 train/test split of the Carseats data (400 observations)
train = sample(nrow(Carseats), 0.8 * nrow(Carseats))
car.train = Carseats[train, ]
car.test = Carseats[-train, ]
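As a quick sanity check, the split should leave 320 rows for training and 80 for testing:

dim(car.train)  # expect 320 rows
dim(car.test)   # expect 80 rows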
library(tree)
set.seed(42)
# Fit a regression tree for Sales on all predictors
car.tree = tree(Sales ~ ., car.train)
plot(car.tree)
text(car.tree, pretty = 0)
# Test-set predictions and mean squared error
tree.pred = predict(car.tree, newdata = car.test)
mse = mean((tree.pred - car.test$Sales)^2)
mse
## [1] 3.906465
The test MSE of the unpruned regression tree is 3.906.
# Cost-complexity cross-validation to choose the tree size
cv.car = cv.tree(car.tree)
plot(cv.car$size, cv.car$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
</gr-replace>
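The best size can also be read off programmatically rather than from the plot (the same idiom is used for the OJ tree below):

# Tree size with the lowest cross-validated deviance
cv.car$size[which.min(cv.car$dev)]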
# Prune to 5 terminal nodes, chosen from the CV results
prune.car = prune.tree(car.tree, best = 5)
plot(prune.car)
text(prune.car, pretty = 0)
tree.pred = predict(prune.car, newdata = car.test)
mse1 = mean((tree.pred - car.test$Sales)^2)
mse1
## [1] 5.084432
We observe that pruning the tree increases the test MSE, from 3.906 to 5.084.

### (d) Random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)
# Bagging: a random forest with mtry = p = 10, so all predictors are
# considered at every split
bag.car = randomForest(Sales ~ ., car.train, importance = TRUE, ntree = 500, mtry = 10)
tree.pred = predict(bag.car, newdata = car.test)
mse2 = mean((tree.pred - car.test$Sales)^2)
mse2
## [1] 2.247194
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 37.2655918 272.684989
## Income 11.3481020 139.099550
## Advertising 24.7312015 187.974015
## Population -0.8309134 84.799869
## Price 74.6565917 742.426277
## ShelveLoc 76.1027278 688.088937
## Age 22.3794426 239.876755
## Education 2.8933345 64.433523
## Urban 0.3376574 9.737118
## US 1.6649153 10.587431
For bagging we set m = p (mtry = 10). The resulting test MSE of 2.2472 is lower than both previous values. The importance measures show that Price, ShelveLoc, and CompPrice are the most significant variables.
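The importance ranking is easier to read as a plot; randomForest's varImpPlot() draws dotcharts of both measures:

varImpPlot(bag.car)  # %IncMSE and IncNodePurity side by side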
# Random forest with mtry = 3, roughly sqrt(p) for p = 10 predictors
rf.car = randomForest(Sales ~., car.train, importance = TRUE, ntree = 500, mtry = 3)
tree.pred = predict(rf.car, newdata = car.test)
mse3 = mean((tree.pred - car.test$Sales)^2)
mse3
## [1] 2.835609
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 16.5827316 225.67910
## Income 6.3862954 193.16698
## Advertising 17.5873572 218.29447
## Population 1.4117218 155.91267
## Price 44.9860497 581.88181
## ShelveLoc 49.9640240 538.27366
## Age 15.2400202 271.29274
## Education 0.8508437 111.00186
## Urban -2.1433634 22.38776
## US 3.8134550 34.28175
In this random forest model we take m ≈ sqrt(p) (mtry = 3). The test MSE of 2.8356 is lower than the trees from (b) and (c) but higher than with bagging.
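To see how the choice of m drives this trade-off, the model can be refit over a small grid of mtry values; a minimal sketch, reusing the objects defined above:

# Test MSE across a few values of mtry (mtry = 10 reproduces bagging)
mtry.grid = c(2, 3, 5, 7, 10)
mse.grid = sapply(mtry.grid, function(m) {
  set.seed(42)
  fit = randomForest(Sales ~ ., car.train, ntree = 500, mtry = m)
  mean((predict(fit, newdata = car.test) - car.test$Sales)^2)
})
plot(mtry.grid, mse.grid, type = "b", xlab = "mtry", ylab = "Test MSE")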
# Switch to the OJ data, reusing the index sampled above: the 750 rows not
# in `train` form the training set, the 320 sampled rows the test set
OJ.train = OJ[-train, ]
OJ.test = OJ[train, ]
set.seed(42)
OJ.tree = tree(Purchase ~., data = OJ.train)
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7681 = 569.9 / 742
## Misclassification error rate: 0.1627 = 122 / 750
The classification tree has 8 terminal nodes and a training misclassification error rate of 16.27%.
OJ.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 750 1025.00 CH ( 0.57067 0.42933 )
## 2) LoyalCH < 0.48285 310 331.20 MM ( 0.22581 0.77419 )
## 4) LoyalCH < 0.276142 180 125.60 MM ( 0.11111 0.88889 )
## 8) LoyalCH < 0.0356415 60 10.17 MM ( 0.01667 0.98333 ) *
## 9) LoyalCH > 0.0356415 120 104.90 MM ( 0.15833 0.84167 ) *
## 5) LoyalCH > 0.276142 130 173.20 MM ( 0.38462 0.61538 )
## 10) SalePriceMM < 2.04 75 80.28 MM ( 0.22667 0.77333 ) *
## 11) SalePriceMM > 2.04 55 74.03 CH ( 0.60000 0.40000 ) *
## 3) LoyalCH > 0.48285 440 423.20 CH ( 0.81364 0.18636 )
## 6) LoyalCH < 0.764572 245 301.80 CH ( 0.69388 0.30612 )
## 12) PriceDiff < 0.265 143 197.40 CH ( 0.53846 0.46154 )
## 24) PriceDiff < -0.165 37 41.05 MM ( 0.24324 0.75676 ) *
## 25) PriceDiff > -0.165 106 138.30 CH ( 0.64151 0.35849 ) *
## 13) PriceDiff > 0.265 102 60.88 CH ( 0.91176 0.08824 ) *
## 7) LoyalCH > 0.764572 195 60.32 CH ( 0.96410 0.03590 ) *
plot(OJ.tree)
text(OJ.tree,pretty=0)
Consider terminal node 25. Its splitting criterion is PriceDiff > -0.165, and it contains 106 observations with a deviance of 138.30. An observation that reaches node 25 is predicted as CH; 64.15% of the observations in this node are CH.
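These numbers can also be read directly from the fitted tree's frame, whose row names are the node numbers:

# Node 25: number of observations, deviance, and predicted class
OJ.tree$frame["25", c("n", "dev", "yval")]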
plot(OJ.tree)
text(OJ.tree,pretty=0)
The splits at the top of the tree all use LoyalCH, which points to it being the most important variable in this model.
set.seed(42)
OJ.pred = predict(OJ.tree, newdata = OJ.test, type = "class")
table(OJ.test$Purchase, OJ.pred)
## OJ.pred
## CH MM
## CH 209 16
## MM 36 59
error = 1 - (209+59)/320
error
## [1] 0.1625
The test misclassification error rate is 16.25%.
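Equivalently, the error rate can be computed without the confusion matrix:

mean(OJ.pred != OJ.test$Purchase)  # fraction misclassified; should match 0.1625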
set.seed(42)
cv.OJ = cv.tree(OJ.tree, FUN = prune.misclass)
cv.OJ
## $size
## [1] 8 7 5 2 1
##
## $dev
## [1] 151 149 149 154 322
##
## $k
## [1] -Inf 0.000000 5.500000 6.333333 170.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.OJ$size, cv.OJ$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
bestsize = cv.OJ$size[which.min(cv.OJ$dev)]
bestsize
## [1] 7
The best size according to which.min is 7; however, the plot shows that size 5 attains the same cross-validated deviance (149), so we select the simpler tree of size 5.
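That choice can be automated by taking the smallest size whose deviance ties the minimum; for the results above this returns 5:

# Smallest tree size attaining the minimum CV deviance
min(cv.OJ$size[cv.OJ$dev == min(cv.OJ$dev)])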
prune.OJ = prune.misclass(OJ.tree, best = 5)
plot(prune.OJ)
text(prune.OJ, pretty = 0)
# Training-set predictions from the pruned tree
prune.pred = predict(prune.OJ, newdata = OJ.train, type = "class")
table(OJ.train$Purchase, prune.pred)
## prune.pred
## CH MM
## CH 349 79
## MM 54 268
error.prune1 = 1 - (349+268)/750
# Training-set predictions from the unpruned tree, for comparison
OJ.pred = predict(OJ.tree, newdata = OJ.train, type = "class")
table(OJ.train$Purchase, OJ.pred)
## OJ.pred
## CH MM
## CH 382 46
## MM 76 246
error1 = 1 - (382+246)/750
error.prune1
## [1] 0.1773333
error1
## [1] 0.1626667
We observe that the training misclassification rate is higher for the pruned tree (17.73% vs. 16.27%).
# Test-set predictions from the pruned tree
prune.pred = predict(prune.OJ, newdata = OJ.test, type = "class")
table(OJ.test$Purchase, prune.pred)
## prune.pred
## CH MM
## CH 198 27
## MM 22 73
error.prune = 1 - (198+73)/320
error.prune
## [1] 0.153125
error
## [1] 0.1625
On the test set, however, the pruned tree's error rate (15.31%) is lower than the unpruned tree's (16.25%).
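Collecting the test error rates computed above makes the comparison explicit:

# Side-by-side test error rates (objects defined above)
data.frame(tree = c("unpruned", "pruned"), test.error = c(error, error.prune))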