PROBLEM 3

Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p̂_m1. The x-axis should display p̂_m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. Hint: In a setting with two classes, p̂_m1 = 1 − p̂_m2. You could make this plot by hand, but it will be much easier to make in R.

p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)                           # Gini index: 2 p (1 - p) for two classes
class.error <- 1 - pmax(p, 1 - p)                       # classification error: 1 - max(p, 1 - p)
cross.entropy <- -(p * log(p) + (1 - p) * log(1 - p))   # entropy; NaN at p = 0 and p = 1, so those points are not drawn
matplot(p, cbind(gini.index, class.error, cross.entropy), type = "l", lty = 1,
        col = c("black", "orange", "blue"),
        xlab = expression(italic(hat(p))[m1]), ylab = "Quantities")
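
matplot does not add a legend by itself, so the three curves can be labelled explicitly; this small addition (not in the original code) uses the same colour order as the cbind() call above.

legend("topright", legend = c("Gini index", "Classification error", "Entropy"),
       col = c("black", "orange", "blue"), lty = 1)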

PROBLEM 8

In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.

  1. Splitting the Data
library(ISLR)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train.car<- Carseats[train, ]
test.car<- Carseats[-train, ]

Training data: 200 observations; test data: 200 observations.
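
A quick sanity check on the split (optional, not part of the original code):

dim(train.car)   # 200 rows, 11 columns
dim(test.car)    # 200 rows, 11 columns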

  2. Regression Tree
library(tree)
tree.car<- tree(Sales ~ ., data = train.car)
summary(tree.car)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train.car)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "Income"     
## [6] "CompPrice"  
## Number of terminal nodes:  18 
## Residual mean deviance:  2.36 = 429.5 / 182 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.2570 -1.0360  0.1024  0.0000  0.9301  3.9130

Plotting the tree

plot(tree.car)
text(tree.car, pretty = 0)

Test MSE

prediction<-predict(tree.car, newdata = test.car)
mean((prediction- test.car$Sales)^2)
## [1] 4.148897

Test MSE = 4.148897
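
Since the MSE is on the squared scale of Sales, its square root is often easier to interpret; a small optional check (Sales is recorded in thousands of units):

sqrt(mean((prediction - test.car$Sales)^2))   # roughly 2.04, i.e. predictions off by about 2,000 units on average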

  3. Cross-Validation
cv.car<- cv.tree(tree.car)
plot(cv.car$size, cv.car$dev, type = "b")
tree.min <- which.min(cv.car$dev)
points(tree.min, cv.car$dev[tree.min], col = "red", cex = 2, pch = 9)

A tree of size 8 can be chosen; we prune the tree to obtain an 8-node tree.

prune.car<- prune.tree(tree.car, best = 8)
plot(prune.car)
text(prune.car, pretty = 0)

Test MSE

predict2<- predict(prune.car, newdata = test.car)
mean((predict2- test.car$Sales)^2)
## [1] 5.09085

Test MSE = 5.09085, which is higher than for the unpruned tree; pruning does not improve the test error here.

  4. Bagging
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
bag.car<- randomForest(Sales ~ ., data = train.car, mtry = 10, ntree = 500, importance = TRUE)
predict3<- predict(bag.car, newdata = test.car)
mean((predict3- test.car$Sales)^2)
## [1] 2.633915

Test MSE = 2.633915
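
Bagging is simply a random forest with mtry equal to the number of predictors. A sketch that avoids hard-coding the 10 (bag.car2 is an illustrative name; it assumes the same training split as above):

n.pred <- ncol(Carseats) - 1   # 10 predictors, i.e. every column except Sales
bag.car2 <- randomForest(Sales ~ ., data = train.car, mtry = n.pred, ntree = 500,
                         importance = TRUE)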

Important Variables

importance(bag.car)
##                %IncMSE IncNodePurity
## CompPrice   16.9874366    126.852848
## Income       3.8985402     78.314126
## Advertising 16.5698586    123.702901
## Population   0.6487058     62.328851
## Price       55.3976775    514.654890
## ShelveLoc   42.7849818    319.133777
## Age         20.5135255    185.582077
## Education    3.4615211     42.253410
## Urban       -2.5125087      8.700009
## US           7.3586645     18.180651

Price and ShelveLoc are the two most important variables.
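
The same importance measures can also be shown graphically (optional):

varImpPlot(bag.car)   # dot charts of %IncMSE and IncNodePurity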

  5. Random Forest
rf.car<- randomForest(Sales ~ ., data = train.car, mtry = 3, ntree = 500, importance = TRUE)
predict4<- predict(rf.car, newdata = test.car)
mean((predict4- test.car$Sales)^2)
## [1] 3.321154

Test MSE = 3.321154
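
To see how the choice of mtry affects the test error, one could sweep over its possible values; this sketch is not part of the original answer and refits ten forests, so it takes a little while to run:

mse.by.mtry <- sapply(1:10, function(m) {
  fit <- randomForest(Sales ~ ., data = train.car, mtry = m, ntree = 500)
  mean((predict(fit, newdata = test.car) - test.car$Sales)^2)
})
plot(1:10, mse.by.mtry, type = "b", xlab = "mtry", ylab = "Test MSE")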

Important Variables

importance(rf.car)
##               %IncMSE IncNodePurity
## CompPrice    7.443405     130.87552
## Income       3.227858     127.18662
## Advertising 13.388259     139.53499
## Population  -1.031306     102.32154
## Price       36.616911     369.59534
## ShelveLoc   31.284175     233.49549
## Age         17.622273     206.09959
## Education    1.454555      70.41374
## Urban       -1.864781      15.13225
## US           6.193082      35.74746

Price and ShelveLoc are the two most important variables.

PROBLEM 9

This problem involves the OJ data set which is part of the ISLR package.

  1. Creating a training set
set.seed(1)
train <- sample(1:nrow(OJ), 800)
train.OJ<- OJ[train, ]
test.OJ<- OJ[-train, ]
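
A quick check of the split sizes (optional):

nrow(train.OJ)   # 800 observations for training
nrow(test.OJ)    # the remaining 270 of OJ's 1,070 rows
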
  2. Fitting a tree
tree.OJ<- tree(Purchase ~ ., data = train.OJ)
summary(tree.OJ)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train.OJ)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7305 = 578.6 / 792 
## Misclassification error rate: 0.165 = 132 / 800

The tree has 8 terminal nodes and a training misclassification error rate of 0.165.

tree.OJ
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1064.00 CH ( 0.61750 0.38250 )  
##    2) LoyalCH < 0.508643 350  409.30 MM ( 0.27143 0.72857 )  
##      4) LoyalCH < 0.264232 166  122.10 MM ( 0.12048 0.87952 )  
##        8) LoyalCH < 0.0356415 57   10.07 MM ( 0.01754 0.98246 ) *
##        9) LoyalCH > 0.0356415 109  100.90 MM ( 0.17431 0.82569 ) *
##      5) LoyalCH > 0.264232 184  248.80 MM ( 0.40761 0.59239 )  
##       10) PriceDiff < 0.195 83   91.66 MM ( 0.24096 0.75904 )  
##         20) SpecialCH < 0.5 70   60.89 MM ( 0.15714 0.84286 ) *
##         21) SpecialCH > 0.5 13   16.05 CH ( 0.69231 0.30769 ) *
##       11) PriceDiff > 0.195 101  139.20 CH ( 0.54455 0.45545 ) *
##    3) LoyalCH > 0.508643 450  318.10 CH ( 0.88667 0.11333 )  
##      6) LoyalCH < 0.764572 172  188.90 CH ( 0.76163 0.23837 )  
##       12) ListPriceDiff < 0.235 70   95.61 CH ( 0.57143 0.42857 ) *
##       13) ListPriceDiff > 0.235 102   69.76 CH ( 0.89216 0.10784 ) *
##      7) LoyalCH > 0.764572 278   86.14 CH ( 0.96403 0.03597 ) *

We pick the terminal node labelled 7. Its split criterion is LoyalCH > 0.764572; the branch contains 278 observations with a deviance of 86.14, and the overall prediction for the branch is CH (about 96% of the observations in this node are CH).

  3. Plot
plot(tree.OJ)
text(tree.OJ, pretty = 0)

LoyalCH appears to be the most important indicator of Purchase.

  4. Confusion Matrix
tree.pred <- predict(tree.OJ, test.OJ, type = "class")
table(tree.pred, test.OJ$Purchase)
##          
## tree.pred  CH  MM
##        CH 147  49
##        MM  12  62

Test error rate

1 - (147 + 62) / 270
## [1] 0.2259259

Test error rate = 0.2259259 (about 22.6%).
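
Equivalently, the test error rate can be computed directly from the confusion matrix instead of typing the counts in by hand:

conf <- table(tree.pred, test.OJ$Purchase)
1 - sum(diag(conf)) / sum(conf)   # same value as above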

  5. Cross-Validation Tree
cv.OJ <- cv.tree(tree.OJ, FUN = prune.misclass)
cv.OJ
## $size
## [1] 8 5 2 1
## 
## $dev
## [1] 156 156 156 306
## 
## $k
## [1]       -Inf   0.000000   4.666667 160.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
  6. Plot
plot(cv.OJ$size, cv.OJ$dev, type = "b", xlab = "Tree size", ylab = "Cross-Validated Classification Error")
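
Instead of reading the best size off the plot, it can also be picked programmatically (a small sketch; sizes 8, 5, and 2 all attain the minimum cross-validated error here):

min(cv.OJ$size[cv.OJ$dev == min(cv.OJ$dev)])   # 2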

  7. The 2-node tree is the smallest tree with the lowest cross-validated classification error rate.

  8. Pruned Tree

prune.OJ <- prune.misclass(tree.OJ, best = 2)
plot(prune.OJ)
text(prune.OJ, pretty = 0)

  9. Comparing Error Rates
summary(tree.OJ)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train.OJ)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7305 = 578.6 / 792 
## Misclassification error rate: 0.165 = 132 / 800
summary(prune.OJ)
## 
## Classification tree:
## snip.tree(tree = tree.OJ, nodes = 3:2)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes:  2 
## Residual mean deviance:  0.9115 = 727.4 / 798 
## Misclassification error rate: 0.1825 = 146 / 800

The training misclassification error rate is higher for the pruned tree (0.1825 vs. 0.165 for the unpruned tree).

prune.pred <- predict(prune.OJ, test.OJ, type = "class")
table(prune.pred, test.OJ$Purchase)
##           
## prune.pred  CH  MM
##         CH 119  30
##         MM  40  81

Test error rate

1 - (119 + 81) / 270
## [1] 0.2592593

Pruning increased the test error rate from 0.2259259 to 0.2592593 (an increase of about 0.033).
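
For reference, the two test error rates side by side (optional):

c(unpruned = 1 - (147 + 62) / 270, pruned = 1 - (119 + 81) / 270)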