library(ggplot2)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(tree)
## Warning: package 'tree' was built under R version 4.0.5
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
library(randomForestSRC)
## Warning: package 'randomForestSRC' was built under R version 4.0.5
##
## randomForestSRC 2.11.0
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
p <- seq(0, 1, 0.01)                                # p-hat for class 1
class <- 1 - pmax(p, 1 - p)                         # classification error rate
gini <- 2 * p * (1 - p)                             # Gini index
entropy <- -(p * log(p) + (1 - p) * log(1 - p))     # cross-entropy (undefined at p = 0 and p = 1)
errordf <- data.frame(p, class, gini, entropy)
ggplot(errordf, aes(x = p)) +
  geom_line(aes(y = class, colour = "class")) +
  geom_line(aes(y = gini, colour = "gini")) +
  geom_line(aes(y = entropy, colour = "entropy"))
## Warning: Removed 2 row(s) containing missing values (geom_path).
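The warning appears because p * log(p) is NaN at the endpoints p = 0 and p = 1, so those two rows are dropped from the entropy curve. A small sketch of one way to avoid it, using the convention 0 * log(0) = 0 (safe_plogp and entropy2 are names introduced here for illustration):
safe_plogp <- function(x) ifelse(x == 0, 0, x * log(x))   # treat 0 * log(0) as 0
entropy2 <- -(safe_plogp(p) + safe_plogp(1 - p))          # defined on the whole [0, 1] grid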
8. In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
cardata <- Carseats
set.seed(456)
car.train_ind <- sample(seq_len(nrow(cardata)), size = round(nrow(cardata) * 0.8))  # 80/20 train/test split
car.train <- cardata[car.train_ind, ]
car.test  <- cardata[-car.train_ind, ]
set.seed(123)
tree <- rpart(Sales ~. , data = car.train, method = 'anova')
rpart.plot(tree)
tree.pred <- predict(tree, car.test)
tree.pred
## 1 6 15 16 18 21 24 28
## 4.717576 9.185714 9.225385 2.941250 9.225385 5.413333 6.577576 6.577576
## 32 41 52 55 57 58 67 72
## 7.116111 4.717576 4.717576 7.116111 10.194583 4.717576 7.484583 7.116111
## 76 77 80 84 86 91 92 93
## 10.194583 9.185714 10.194583 4.807273 6.577576 7.484583 6.347059 6.577576
## 96 102 110 115 116 124 128 134
## 7.116111 6.577576 7.484583 6.347059 5.413333 7.310000 6.577576 4.807273
## 138 143 147 158 173 178 184 199
## 6.577576 7.484583 4.717576 8.080000 9.225385 10.194583 7.484583 2.941250
## 208 212 214 219 224 225 233 235
## 4.807273 6.347059 5.413333 8.898667 6.347059 5.413333 13.622857 9.225385
## 236 246 247 251 253 259 264 267
## 7.116111 11.236250 4.807273 7.310000 6.577576 9.185714 6.577576 9.225385
## 273 274 275 276 279 288 289 317
## 11.236250 9.185714 6.577576 6.347059 7.310000 8.080000 7.484583 11.236250
## 320 323 328 329 342 343 347 355
## 7.116111 7.310000 7.484583 4.717576 7.484583 8.898667 6.577576 5.413333
## 363 365 368 370 371 375 383 399
## 4.717576 12.098750 11.236250 8.898667 4.717576 8.898667 6.347059 10.194583
car.test$Sales
## [1] 9.50 10.81 11.17 8.71 12.29 6.41 5.87 5.27 8.25 2.07 4.42 4.90
## [13] 11.91 0.91 8.85 6.50 8.55 10.64 9.14 4.42 8.47 5.33 4.81 4.53
## [25] 5.58 6.20 8.98 9.31 8.54 8.19 6.52 7.62 6.52 7.44 3.90 10.21
## [37] 9.03 10.48 5.32 3.62 8.19 9.39 8.23 9.70 3.45 4.10 13.14 9.43
## [49] 5.53 10.00 6.90 9.16 8.31 3.47 7.77 9.10 12.98 10.04 7.22 6.67
## [61] 7.22 6.88 6.98 15.63 6.97 9.16 6.23 3.15 7.38 7.81 8.97 5.30
## [73] 5.25 10.50 14.37 10.26 7.68 9.44 4.95 5.94
mean(abs(tree.pred-car.test$Sales)/car.test$Sales)
## [1] 0.2885341
The base regression tree has a mean absolute percentage error (MAPE) of about 28.9% on the test set.
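The exercise's usual yardstick is the test MSE rather than the percentage error used above; a sketch of that alternative (tree.mse is a name introduced here, output not shown):
tree.mse <- mean((tree.pred - car.test$Sales)^2)   # test mean squared error of the base tree
tree.mse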
set.seed(123)
tree.cp <- rpart(Sales ~. , data = car.train, method = 'anova', cp = 0.001)
best.prune <- tree.cp$cptable[which.min(tree.cp$cptable[,"xerror"]),"CP"]
printcp(tree.cp)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = car.train, method = "anova",
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Education Income Price
## [7] ShelveLoc
##
## Root node error: 2575.3/320 = 8.0478
##
## n= 320
##
## CP nsplit rel error xerror xstd
## 1 0.2435711 0 1.00000 1.01126 0.076807
## 2 0.1212104 1 0.75643 0.77175 0.059602
## 3 0.0481666 2 0.63522 0.65436 0.051007
## 4 0.0414491 3 0.58705 0.66756 0.051662
## 5 0.0362224 4 0.54560 0.65752 0.051197
## 6 0.0301673 5 0.50938 0.65033 0.050563
## 7 0.0251026 6 0.47921 0.65079 0.049119
## 8 0.0226317 7 0.45411 0.66650 0.053135
## 9 0.0209945 8 0.43148 0.67189 0.052467
## 10 0.0204290 9 0.41048 0.66093 0.051090
## 11 0.0196127 10 0.39006 0.65560 0.051242
## 12 0.0138903 11 0.37044 0.63541 0.051438
## 13 0.0137574 12 0.35655 0.63848 0.050678
## 14 0.0137470 13 0.34279 0.63848 0.050678
## 15 0.0115919 14 0.32905 0.65127 0.050855
## 16 0.0108667 15 0.31746 0.62932 0.048178
## 17 0.0107702 16 0.30659 0.62695 0.047912
## 18 0.0100950 17 0.29582 0.62969 0.047955
## 19 0.0098496 18 0.28572 0.62552 0.047836
## 20 0.0084243 19 0.27587 0.60608 0.046808
## 21 0.0075961 20 0.26745 0.59726 0.046408
## 22 0.0069510 21 0.25985 0.59449 0.046043
## 23 0.0063559 22 0.25290 0.58948 0.045976
## 24 0.0060326 23 0.24655 0.58513 0.045939
## 25 0.0032583 24 0.24051 0.56495 0.044966
## 26 0.0032329 25 0.23726 0.56592 0.045179
## 27 0.0010000 26 0.23402 0.56682 0.045119
plotcp(tree.cp)
prune <- prune(tree.cp, cp = best.prune)
rpart.plot(prune, uniform=TRUE)
tree.pred.prune <- predict(prune, car.test)
mean(abs((tree.pred.prune-car.test$Sales)/car.test$Sales))
## [1] 0.2806578
Pruning reduces the test MAPE slightly, from about 28.9% to about 28.1%.
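Instead of the cp with the minimum cross-validated error, a common alternative is the one-standard-error rule: pick the simplest tree whose xerror lies within one xstd of the minimum. A sketch of that rule, not run in this report (cpt, cp.1se and car.prune.1se are names introduced here for illustration):
cpt <- tree.cp$cptable
min.row <- which.min(cpt[, "xerror"])
thresh <- cpt[min.row, "xerror"] + cpt[min.row, "xstd"]   # 1-SE threshold
cp.1se <- cpt[which(cpt[, "xerror"] <= thresh)[1], "CP"]  # first (smallest) tree under the threshold
car.prune.1se <- prune(tree.cp, cp = cp.1se)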
set.seed(123)
bag_model <- randomForest(Sales ~ .,
                          data = car.train,
                          importance = TRUE,
                          mtry = 10)   # all 10 predictors tried at each split, i.e. bagging
bag_model
##
## Call:
## randomForest(formula = Sales ~ ., data = car.train, importance = TRUE, mtry = 10)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.406538
## % Var explained: 70.1
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 34.6620426 256.672345
## Income 11.8486784 138.164924
## Advertising 23.8993524 205.555766
## Population -1.0853189 70.697339
## Price 71.0334051 770.670007
## ShelveLoc 85.8945742 774.259202
## Age 21.8597528 204.257940
## Education 1.5611812 69.739059
## Urban 0.9583677 8.528455
## US 4.6256758 19.660925
varImpPlot(bag_model)
bag.pred <- predict(bag_model, car.test)
mean(abs((bag.pred-car.test$Sales)/car.test$Sales))
## [1] 0.2200931
Bagging (mtry = 10, so all predictors are considered at each split) improves substantially on the single tree, reducing the test MAPE from about 28.9% to about 22.0%.
set.seed(123)
forest_model <- randomForest(Sales ~ .,
                             data = car.train,
                             importance = TRUE)   # default mtry (p/3, here 3) gives a random forest
forest_model
##
## Call:
## randomForest(formula = Sales ~ ., data = car.train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 2.754739
## % Var explained: 65.77
importance(forest_model)
## %IncMSE IncNodePurity
## CompPrice 18.6739548 234.27064
## Income 5.5335988 188.55399
## Advertising 19.8769289 243.65559
## Population -0.5766713 140.92935
## Price 45.1235953 630.71363
## ShelveLoc 53.1645538 597.19152
## Age 16.2152203 247.31704
## Education 2.2468307 107.73122
## Urban -0.5131205 21.17487
## US 5.5550820 38.01989
varImpPlot(forest_model)
forest.pred <- predict(forest_model, car.test)
mean(abs((forest.pred-car.test$Sales)/car.test$Sales))
## [1] 0.2553502
The random forest (default mtry = 3) reduces the test MAPE to about 25.5%, better than the single tree but worse than bagging on this split.
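To see the effect of mtry more directly, the test error could be traced over a range of values; a sketch, not run here (mtry.vals and mtry.mape are names introduced for illustration):
mtry.vals <- 1:10
mtry.mape <- sapply(mtry.vals, function(m) {
  set.seed(123)
  fit <- randomForest(Sales ~ ., data = car.train, mtry = m)
  pred <- predict(fit, car.test)
  mean(abs((pred - car.test$Sales) / car.test$Sales))     # test MAPE at this mtry
})
plot(mtry.vals, mtry.mape, type = "b", xlab = "mtry", ylab = "test MAPE")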
ojdata <- OJ
set.seed(123)
oj.train_ind <- sample(seq_len(nrow(ojdata)), size = 800)   # training set of 800 observations
oj.train <- ojdata[oj.train_ind, ]
oj.test  <- ojdata[-oj.train_ind, ]
set.seed(123)
oj.tree <- tree(Purchase ~. , data = oj.train, method = 'class')
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj.train, method = "class")
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7625 = 603.9 / 792
## Misclassification error rate: 0.165 = 132 / 800
The tree uses only LoyalCH and PriceDiff, has 8 terminal nodes, and a training misclassification error rate of 16.5%.
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1071.00 CH ( 0.60875 0.39125 )
## 2) LoyalCH < 0.5036 350 415.10 MM ( 0.28000 0.72000 )
## 4) LoyalCH < 0.276142 170 131.00 MM ( 0.12941 0.87059 )
## 8) LoyalCH < 0.0356415 56 10.03 MM ( 0.01786 0.98214 ) *
## 9) LoyalCH > 0.0356415 114 108.90 MM ( 0.18421 0.81579 ) *
## 5) LoyalCH > 0.276142 180 245.20 MM ( 0.42222 0.57778 )
## 10) PriceDiff < 0.05 74 74.61 MM ( 0.20270 0.79730 ) *
## 11) PriceDiff > 0.05 106 144.50 CH ( 0.57547 0.42453 ) *
## 3) LoyalCH > 0.5036 450 357.10 CH ( 0.86444 0.13556 )
## 6) PriceDiff < -0.39 27 32.82 MM ( 0.29630 0.70370 ) *
## 7) PriceDiff > -0.39 423 273.70 CH ( 0.90071 0.09929 )
## 14) LoyalCH < 0.705326 130 135.50 CH ( 0.78462 0.21538 )
## 28) PriceDiff < 0.145 43 58.47 CH ( 0.58140 0.41860 ) *
## 29) PriceDiff > 0.145 87 62.07 CH ( 0.88506 0.11494 ) *
## 15) LoyalCH > 0.705326 293 112.50 CH ( 0.95222 0.04778 ) *
Node 2 corresponds to the split LoyalCH < 0.5036. It contains 350 observations with a deviance of 415.1, its predicted class is MM, and the within-node class proportions are 28% CH and 72% MM; since it is not marked with an asterisk, it is an internal (non-terminal) node.
plot(oj.tree)
text(oj.tree,pretty=0)
The plot shows that LoyalCH drives the first and most of the subsequent splits, making it the most influential variable, with PriceDiff used for the remaining splits.
oj.tree.pred <- predict(oj.tree, oj.test, type = "class")
oj.tree.table <- table(oj.test$Purchase, oj.tree.pred)
oj.tree.table
## oj.tree.pred
## CH MM
## CH 150 16
## MM 34 70
(oj.tree.table[2,1]+oj.tree.table[1,2])/sum(oj.tree.table)
## [1] 0.1851852
The test misclassification error rate is about 18.5%.
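The same rate can also be computed directly from the predictions rather than from the table (a one-line sketch):
mean(oj.tree.pred != oj.test$Purchase)   # equals the table-based rate of 0.1852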
set.seed(123)
oj.tree.cp <- rpart(Purchase ~. , data = oj.train, method = 'class', cp = 0.001)
oj.best.prune <- oj.tree.cp$cptable[which.min(oj.tree.cp$cptable[,"xerror"]),"CP"]
printcp(oj.tree.cp)
##
## Classification tree:
## rpart(formula = Purchase ~ ., data = oj.train, method = "class",
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] LoyalCH PriceCH PriceDiff SpecialCH STORE
## [6] StoreID WeekofPurchase
##
## Root node error: 313/800 = 0.39125
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.4920128 0 1.00000 1.00000 0.044101
## 2 0.0351438 1 0.50799 0.53355 0.036726
## 3 0.0255591 2 0.47284 0.53355 0.036726
## 4 0.0127796 4 0.42173 0.44728 0.034336
## 5 0.0095847 7 0.38339 0.44728 0.034336
## 6 0.0039936 8 0.37380 0.46326 0.034811
## 7 0.0031949 12 0.35783 0.48562 0.035450
## 8 0.0015974 14 0.35144 0.49840 0.035803
## 9 0.0010000 16 0.34824 0.50479 0.035975
plotcp(oj.tree.cp)
oj.prune <- prune(oj.tree.cp, cp = oj.best.prune)
which.min(oj.tree.cp$cptable[,"xerror"])
## 4
## 4
The minimum cross-validated error (xerror = 0.44728) occurs at row 4 of the CP table, a tree with 4 splits; row 5, with 7 splits, ties it, but which.min() returns the first, so the pruned tree has 4 splits (5 terminal nodes).
rpart.plot(oj.prune, uniform=TRUE)
oj.pred_train <- predict(oj.tree, type = "class")          # unpruned tree, predictions on the training data
mean(oj.pred_train != oj.train$Purchase)
## [1] 0.165
oj.prune.pred_train <- predict(oj.prune, type = "class")   # pruned tree, also on the training data
mean(oj.prune.pred_train != oj.train$Purchase)
## [1] 0.165
On the training data, the unpruned and the pruned tree have the same misclassification error rate of 16.5%.
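Note that predict() without a newdata argument scores the training data, so both rates above are training errors. A sketch of the corresponding test-set comparison, which the exercise also asks for (oj.tree.test and oj.prune.test are names introduced here, output not shown):
oj.tree.test <- predict(oj.tree, oj.test, type = "class")     # unpruned tree on the test set
oj.prune.test <- predict(oj.prune, oj.test, type = "class")   # pruned tree on the test set
mean(oj.tree.test != oj.test$Purchase)
mean(oj.prune.test != oj.test$Purchase)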