# Gini index, entropy (natural log) and classification error for a two-class node
p = seq(0, 1, 0.01)
gini = 2 * p * (1 - p)
entropy = -(p * log(p) + (1 - p) * log(1 - p))  # NaN at p = 0 and p = 1; lines() simply skips those points
classerror = 1 - pmax(p, 1 - p)
plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), xlab = 'p',
     ylab = 'gini, entropy, classerror')
lines(p, gini)
lines(p, classerror, col = 'blue')
lines(p, entropy, col = 'red')
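Since the three curves share one set of axes, a legend (a small optional addition, not part of the original figure) makes them easier to tell apart:
legend('topright', legend = c('Gini', 'Class. error', 'Entropy'),
       col = c('black', 'blue', 'red'), lty = 1)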
library(ISLR)
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
attach(Carseats)
set.seed(1)
train = sample(dim(Carseats)[1], dim(Carseats)[1]/2)
train.car = Carseats[train, ]
test.car = Carseats[-train, ]
8B. The test MSE obtained is about 4.92.
library(tree)
carseats.tree = tree(Sales ~ ., data = train.car)
summary(carseats.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train.car)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(carseats.tree)
text(carseats.tree, pretty = 0, cex = 0.60)
carseats.pred = predict(carseats.tree, test.car)
mean((test.car$Sales - carseats.pred)^2)
## [1] 4.922039
8C. Pruning the tree (to 9 terminal nodes) barely changes the test MSE: it drops from about 4.922 to 4.918.
cv.carseats = cv.tree(carseats.tree, FUN = prune.tree)
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
carseats.pr = prune.tree(carseats.tree, best = 9)
par(mfrow = c(1, 1))
plot(carseats.pr)
text(carseats.pr, pretty = 0)
pred.pruned = predict(carseats.pr, test.car)
mean((test.car$Sales - pred.pruned)^2)
## [1] 4.918134
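Instead of reading the best size off the plot, the size with the smallest cross-validated deviance can be pulled directly from the cv.tree object (a small sketch using the cv.carseats object above; best.size is just an illustrative name):
best.size = cv.carseats$size[which.min(cv.carseats$dev)]  # size with the lowest CV deviance
best.size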
8D. Bagging lowers the test MSE to about 2.66. The most important variables are Price, ShelveLoc and Age.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
bag.tree = randomForest(Sales ~ ., data = train.car, mtry = 10, ntree = 500, importance = T)
bag.pred = predict(bag.tree, test.car)
mean((test.car$Sales - bag.pred)^2)
## [1] 2.657296
importance(bag.tree)
## %IncMSE IncNodePurity
## CompPrice 23.07909904 171.185734
## Income 2.82081527 94.079825
## Advertising 11.43295625 99.098941
## Population -3.92119532 59.818905
## Price 54.24314632 505.887016
## ShelveLoc 46.26912996 361.962753
## Age 14.24992212 159.740422
## Education -0.07662320 46.738585
## Urban 0.08530119 8.453749
## US 4.34349223 15.157608
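The same importance scores can also be shown graphically; the randomForest package provides varImpPlot for a quick dot chart of %IncMSE and IncNodePurity:
varImpPlot(bag.tree)  # dot charts of the two importance measures above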
8E. With mtry = 5 the test MSE is about 2.70. Price, ShelveLoc and Age are still the most important variables.
car.rf = randomForest(Sales ~ ., data = train.car, mtry = 5, ntree = 500, importance = T)
rf.pred = predict(car.rf, test.car)
mean((test.car$Sales - rf.pred)^2)
## [1] 2.701665
importance(car.rf)
## %IncMSE IncNodePurity
## CompPrice 19.8160444 162.73603
## Income 2.8940268 106.96093
## Advertising 11.6799573 106.30923
## Population -1.6998805 79.04937
## Price 46.3454015 448.33554
## ShelveLoc 40.4412189 334.33610
## Age 12.5440659 169.06125
## Education 1.0762096 55.87510
## Urban 0.5703583 13.21963
## US 5.8799999 25.59797
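To see how the choice of m affects the error rate, one could refit the forest over a grid of mtry values and track the test MSE (a sketch only, not run above; it refits ten forests, so it takes a little while):
mtry.vals = 1:10
test.mse = sapply(mtry.vals, function(m) {
  rf = randomForest(Sales ~ ., data = train.car, mtry = m, ntree = 500)
  mean((test.car$Sales - predict(rf, test.car))^2)
})
plot(mtry.vals, test.mse, type = 'b', xlab = 'mtry', ylab = 'Test MSE')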
set.seed(42)
train_ind <- sample(1:nrow(OJ), 800)
train <- OJ[train_ind,]
test <- OJ[-train_ind,]
9B. The tree uses only three of the predictors: LoyalCH, SalePriceMM and PriceDiff. It has 8 terminal nodes and a training error rate of 0.164.
oj.tree <- tree(Purchase ~., data = train)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7392 = 585.5 / 792
## Misclassification error rate: 0.1638 = 131 / 800
9C. Picking the terminal node labelled 24. The splitting variable at this node is PriceDiff and the split value is -0.165. The node contains 32 observations with a deviance of 38.02; the predicted class is MM, and about 72% of the observations in this node are MM purchases.
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1066.00 CH ( 0.61500 0.38500 )
## 2) LoyalCH < 0.48285 285 296.00 MM ( 0.21404 0.78596 )
## 4) LoyalCH < 0.064156 64 0.00 MM ( 0.00000 1.00000 ) *
## 5) LoyalCH > 0.064156 221 260.40 MM ( 0.27602 0.72398 )
## 10) SalePriceMM < 2.04 128 123.50 MM ( 0.18750 0.81250 ) *
## 11) SalePriceMM > 2.04 93 125.00 MM ( 0.39785 0.60215 ) *
## 3) LoyalCH > 0.48285 515 458.10 CH ( 0.83689 0.16311 )
## 6) LoyalCH < 0.753545 230 282.70 CH ( 0.69565 0.30435 )
## 12) PriceDiff < 0.265 149 203.00 CH ( 0.57718 0.42282 )
## 24) PriceDiff < -0.165 32 38.02 MM ( 0.28125 0.71875 ) *
## 25) PriceDiff > -0.165 117 150.30 CH ( 0.65812 0.34188 )
## 50) LoyalCH < 0.703993 105 139.60 CH ( 0.61905 0.38095 ) *
## 51) LoyalCH > 0.703993 12 0.00 CH ( 1.00000 0.00000 ) *
## 13) PriceDiff > 0.265 81 47.66 CH ( 0.91358 0.08642 ) *
## 7) LoyalCH > 0.753545 285 111.70 CH ( 0.95088 0.04912 ) *
9D. LoyalCH dominates the tree: observations in the left branch (low loyalty to Citrus Hill) are mostly predicted to buy Minute Maid, while the right branch (high loyalty) shows a strong preference for Citrus Hill, with the price variables refining the splits further down.
plot(oj.tree)
text(oj.tree)
9E. Predicting on the test set and forming the confusion matrix below, the test error rate is (36 + 15)/270 ≈ 0.189.
oj.pred = predict(oj.tree, test, type = "class")
table(test$Purchase, oj.pred)
## oj.pred
## CH MM
## CH 125 36
## MM 15 94
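The same error rate can be computed directly from the predictions:
mean(test$Purchase != oj.pred)  # test error rate, about 0.189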
9F.
oj.cv = cv.tree(oj.tree, FUN = prune.tree)
9G.
plot(oj.cv$size, oj.cv$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")
9H. A tree size of 8 gives the lowest cross-validation deviance.
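The cross-validation above minimises deviance; cv.tree can also be run with FUN = prune.misclass to cross-validate on the misclassification count instead, which is a reasonable alternative to check (sketch; oj.cv.mis is just an illustrative name):
oj.cv.mis = cv.tree(oj.tree, FUN = prune.misclass)
plot(oj.cv.mis$size, oj.cv.mis$dev, type = 'b',
     xlab = 'Tree Size', ylab = 'CV misclassifications')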
9I-K
oj.prun = prune.tree(oj.tree, best = 8)
summary(oj.prun)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7392 = 585.5 / 792
## Misclassification error rate: 0.1638 = 131 / 800
The training error rate is 0.164, the same as the unpruned tree: since cross-validation selects 8 terminal nodes, pruning with best = 8 returns the original tree unchanged.
pred.unprun = predict(oj.tree, test, type = "class")
mis.unprun = sum(test$Purchase != pred.unprun)
mis.unprun/length(pred.unprun)
## [1] 0.1888889
pred.prun = predict(oj.prun, test, type = "class")
mis.prun = sum(test$Purchase != pred.prun)
mis.prun/length(pred.prun)
## [1] 0.1888889
The pruned and unpruned trees have the same test error rate of about 0.189, as expected since the two trees are identical.
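Because cross-validation selects the full 8-node tree here, pruning never actually removes any splits. To see pruning take effect, one could force a smaller tree, for example with five terminal nodes, and compare its training and test error rates (a sketch only; oj.prun5 is an illustrative name and its error rates are not reported above):
oj.prun5 = prune.tree(oj.tree, best = 5)                        # force a 5-terminal-node tree
summary(oj.prun5)                                               # training error of the smaller tree
mean(test$Purchase != predict(oj.prun5, test, type = "class"))  # its test error rate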