Exercise 3 - Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p̂_m1. The x-axis should display p̂_m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. With two classes, writing p = p̂_m1, the three measures reduce to G = 2p(1 - p), E = 1 - max(p, 1 - p), and D = -(p log p + (1 - p) log(1 - p)), which is what the code below computes.
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
entropy <- - (p * log(p) + (1 - p) * log(1 - p))  # NaN at p = 0 and p = 1, which matplot silently drops
matplot(p, cbind(class.error, gini.index, entropy), type = "l", lty = 1,
        col = c("blue", "green", "red"), xlab = "p", ylab = "Value")
legend("topright", c("Classification error", "Gini index", "Entropy"),
       col = c("blue", "green", "red"), lty = 1)
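As a quick sanity check, all three measures peak at p = 0.5; with the natural log used above, the exact maxima are:
max(gini.index)             # 0.5
max(class.error)            # 0.5
max(entropy, na.rm = TRUE)  # log(2), about 0.6931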
Exercise 8 - In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(ISLR)
library(tree)
library(randomForest)
data(Carseats)
set.seed(1)
train = sample(1:nrow(Carseats), nrow(Carseats)/2)
Carseats.train = Carseats[train, ]
Carseats.test = Carseats[-train, ]
tree.carseats = tree(Sales ~ ., data = Carseats.train)
summary(tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree.carseats)
text(tree.carseats, pretty = 0)
regpred.carseats = predict(tree.carseats, Carseats.test)
mean((Carseats.test$Sales - regpred.carseats)^2)
## [1] 4.922039
set.seed(2)
cv.carseats = cv.tree(tree.carseats)
plot(cv.carseats$size, cv.carseats$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
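The best pruning size can also be read off programmatically from the $size and $dev components returned by cv.tree (its value depends on the CV run; 9 terminal nodes are used below):
best.size = cv.carseats$size[which.min(cv.carseats$dev)]
best.size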
prune.carseats = prune.tree(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
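For comparison with the unpruned tree's test MSE of 4.92 above, the pruned tree can be scored the same way (output not shown):
prunepred.carseats = predict(prune.carseats, Carseats.test)
mean((Carseats.test$Sales - prunepred.carseats)^2)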
set.seed(3)
bag.carseats = randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 10, importance = TRUE)
bag.carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = 10, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.90505
## % Var explained: 63.06
bagpred.carseats = predict(bag.carseats, newdata = Carseats.test)
mean((Carseats.test$Sales - bagpred.carseats)^2)
## [1] 2.630702
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 25.6368749 169.596380
## Income 5.5092406 93.989153
## Advertising 13.0320872 99.531672
## Population -1.3427420 56.478915
## Price 52.4411414 498.870909
## ShelveLoc 47.4964524 384.768230
## Age 16.7659587 153.617412
## Education -0.4141856 44.838301
## Urban 0.2914200 9.449681
## US 5.1460185 15.033204
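The two importance measures can also be displayed graphically with randomForest's varImpPlot():
varImpPlot(bag.carseats)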
set.seed(4)
rf.carseats = randomForest(Sales ~ ., data = Carseats.train, mtry = 8, importance = TRUE)
rfpred.carseats = predict(rf.carseats, Carseats.test)
mean((Carseats.test$Sales - rfpred.carseats)^2)
## [1] 2.612723
importance(rf.carseats)
## IncNodePurity
## CompPrice 166.998303
## Income 97.792200
## Advertising 102.178454
## Population 62.289204
## Price 499.049584
## ShelveLoc 361.148227
## Age 164.647876
## Education 48.259514
## Urban 8.941162
## US 19.884424
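To see how the choice of mtry affects the result, the forest can be refit for each m from 1 to 10 and the test MSEs plotted (a sketch, output not shown; with the default ntree = 500 this takes a little while):
rf.mse = rep(NA, 10)
for (m in 1:10) {
  set.seed(4)
  fit = randomForest(Sales ~ ., data = Carseats.train, mtry = m)
  rf.mse[m] = mean((Carseats.test$Sales - predict(fit, Carseats.test))^2)
}
plot(1:10, rf.mse, type = "b", xlab = "mtry", ylab = "Test MSE")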
Exercise 9 - This problem involves the OJ data set which is part of the ISLR package.
set.seed(1)
train = sample(1:nrow(OJ), 800)
OJ.train = OJ[train, ]
OJ.test = OJ[-train, ]
OJ.tree = tree(Purchase ~ ., data = OJ.train)
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
OJ.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196197 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196197 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
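Picking one terminal node for interpretation: node 8 holds the 59 training customers with LoyalCH < 0.0356, of whom 98.3% purchase MM, so the tree predicts MM there with high confidence. LoyalCH dominates the fit; the top three splits all use it.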
plot(OJ.tree)
text(OJ.tree, pretty = 0, cex = 0.5)
tree.pred = predict(OJ.tree, newdata = OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##
## tree.pred CH MM
## CH 160 38
## MM 8 64
test_error = 1 - ((160 + 64) / 270)
phrase = "The test error rate is: "
paste(phrase, test_error)
## [1] "The test error rate is: 0.17037037037037"
cv.OJ = cv.tree(OJ.tree, FUN = prune.misclass)
cv.OJ
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 150 150 149 158 172 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.OJ$size, cv.OJ$dev, type = "b", xlab = "Tree size", ylab = "CV classification errors")
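From the printout above, 7 terminal nodes give the smallest cross-validated error (dev = 149), which can be extracted directly:
cv.OJ$size[which.min(cv.OJ$dev)]
## [1] 7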
prune.OJ = prune.misclass(OJ.tree, best = 7)
plot(prune.OJ)
text(prune.OJ, pretty = 0, cex = 0.5)
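Finally, the pruned tree's test performance can be compared against the unpruned tree's 17.04% error rate (output not shown):
prune.pred = predict(prune.OJ, newdata = OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
mean(prune.pred != OJ.test$Purchase)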