Exercise 3
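
For a two-class node where p is the proportion of observations in the first class, the three impurity measures plotted below are the Gini index 2p(1 - p), the cross-entropy -[p*log(p) + (1 - p)*log(1 - p)], and the classification error 1 - max(p, 1 - p).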

p = seq(0, 1, 0.01)
gini = p * (1 - p) * 2
entropy = -(p * log(p) + (1 - p) * log(1 - p))
classerror = 1 - pmax(p, 1 - p)
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='gini, entropy, classerror')

lines(p,gini,type='l')
lines(p,classerror,col='blue')
lines(p,entropy,col='red')
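
A legend makes the three curves easier to tell apart; the colours below match the lines() calls above (a small addition, not in the original code).

legend('topright', legend = c('gini', 'classerror', 'entropy'),
       col = c('black', 'blue', 'red'), lty = 1)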

Exercise 8(A)

library(ISLR)

str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
attach(Carseats)
set.seed(1)

train = sample(dim(Carseats)[1], dim(Carseats)[1]/2)
train.car = Carseats[train, ]
test.car = Carseats[-train, ]

8B. Fitting a regression tree to the training set gives a test MSE of 4.92.

library(tree)
carseats.tree = tree(Sales ~ ., data = train.car)
summary(carseats.tree)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train.car)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
plot(carseats.tree)
text(carseats.tree, pretty = 0, cex = 0.60)

carseats.pred = predict(carseats.tree, test.car)
mean((test.car$Sales - carseats.pred)^2)
## [1] 4.922039

8C. Cross-validation is used to choose the tree size; pruning to 9 terminal nodes gives a test MSE of 4.92, essentially the same as (marginally better than) the unpruned tree.

cv.carseats = cv.tree(carseats.tree, FUN = prune.tree)
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
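
The tree size with the lowest cross-validated deviance can also be read off programmatically rather than from the plots (a small convenience check, not part of the original answer):

cv.carseats$size[which.min(cv.carseats$dev)]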

carseats.pr = prune.tree(carseats.tree, best = 9)
par(mfrow = c(1, 1))
plot(carseats.pr)
text(carseats.pr, pretty = 0)

pred.pruned = predict(carseats.pr, test.car)
mean((test.car$Sales - pred.pruned)^2)
## [1] 4.918134

8D. Bagging (mtry = 10, i.e. using all predictors at each split) reduces the test MSE to 2.66. Price and ShelveLoc are clearly the most important variables, followed by CompPrice and Age.

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
bag.tree = randomForest(Sales ~ ., data = train.car, mtry = 10, ntree = 500, importance = T)
bag.pred = predict(bag.tree, test.car)
mean((test.car$Sales - bag.pred)^2)
## [1] 2.657296
importance(bag.tree)
##                 %IncMSE IncNodePurity
## CompPrice   23.07909904    171.185734
## Income       2.82081527     94.079825
## Advertising 11.43295625     99.098941
## Population  -3.92119532     59.818905
## Price       54.24314632    505.887016
## ShelveLoc   46.26912996    361.962753
## Age         14.24992212    159.740422
## Education   -0.07662320     46.738585
## Urban        0.08530119      8.453749
## US           4.34349223     15.157608
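
The same importance measures can be visualised directly with varImpPlot() from the randomForest package:

varImpPlot(bag.tree)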

8E. With a random forest using mtry = 5, the test MSE is 2.70. Price and ShelveLoc remain the most important variables, followed by CompPrice and Age.

car.rf = randomForest(Sales ~ ., data = train.car, mtry = 5, ntree = 500, importance = T)
rf.pred = predict(car.rf, test.car)
mean((test.car$Sales - rf.pred)^2)
## [1] 2.701665
importance(car.rf)
##                %IncMSE IncNodePurity
## CompPrice   19.8160444     162.73603
## Income       2.8940268     106.96093
## Advertising 11.6799573     106.30923
## Population  -1.6998805      79.04937
## Price       46.3454015     448.33554
## ShelveLoc   40.4412189     334.33610
## Age         12.5440659     169.06125
## Education    1.0762096      55.87510
## Urban        0.5703583      13.21963
## US           5.8799999      25.59797
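
To see how the choice of mtry affects the test MSE, one could sweep over its possible values. This is a quick sketch rather than part of the original answer; the exact numbers will vary from run to run because each forest is random.

mse.by.mtry = sapply(1:10, function(m) {
  # fit a forest using m predictors per split and compute its test MSE
  fit = randomForest(Sales ~ ., data = train.car, mtry = m, ntree = 500)
  mean((test.car$Sales - predict(fit, test.car))^2)
})
plot(1:10, mse.by.mtry, type = "b", xlab = "mtry", ylab = "Test MSE")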

Exercise 9(A)

set.seed(42)

train_ind <- sample(1:nrow(OJ), 800)

train <- OJ[train_ind,]
test <- OJ[-train_ind,]

9B. The tree uses three variables: LoyalCH, SalePriceMM and PriceDiff. It has 8 terminal nodes and a training error rate of 0.164.

oj.tree <- tree(Purchase ~., data = train)
summary(oj.tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"  
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7392 = 585.5 / 792 
## Misclassification error rate: 0.1638 = 131 / 800

9C. Take the terminal node labeled 24. It is reached by the split PriceDiff < -0.165 and contains 32 observations with a deviance of 38.02. The predicted class is MM, with about 72% of the observations in this node having purchased MM and 28% CH.

oj.tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1066.00 CH ( 0.61500 0.38500 )  
##    2) LoyalCH < 0.48285 285  296.00 MM ( 0.21404 0.78596 )  
##      4) LoyalCH < 0.064156 64    0.00 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.064156 221  260.40 MM ( 0.27602 0.72398 )  
##       10) SalePriceMM < 2.04 128  123.50 MM ( 0.18750 0.81250 ) *
##       11) SalePriceMM > 2.04 93  125.00 MM ( 0.39785 0.60215 ) *
##    3) LoyalCH > 0.48285 515  458.10 CH ( 0.83689 0.16311 )  
##      6) LoyalCH < 0.753545 230  282.70 CH ( 0.69565 0.30435 )  
##       12) PriceDiff < 0.265 149  203.00 CH ( 0.57718 0.42282 )  
##         24) PriceDiff < -0.165 32   38.02 MM ( 0.28125 0.71875 ) *
##         25) PriceDiff > -0.165 117  150.30 CH ( 0.65812 0.34188 )  
##           50) LoyalCH < 0.703993 105  139.60 CH ( 0.61905 0.38095 ) *
##           51) LoyalCH > 0.703993 12    0.00 CH ( 1.00000 0.00000 ) *
##       13) PriceDiff > 0.265 81   47.66 CH ( 0.91358 0.08642 ) *
##      7) LoyalCH > 0.753545 285  111.70 CH ( 0.95088 0.04912 ) *

9D. Looking at the tree, LoyalCH dominates the splits: the left branch (low loyalty to Citrus Hill) mostly predicts MM purchases, while the right branch (high loyalty) shows a strong preference for CH.

plot(oj.tree)
text(oj.tree)

9E.

oj.pred = predict(oj.tree, test, type = "class")
table(test$Purchase, oj.pred)
##     oj.pred
##       CH  MM
##   CH 125  36
##   MM  15  94
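
The test error rate follows directly from the confusion matrix: (36 + 15) / 270, or roughly 0.189.

mean(test$Purchase != oj.pred)
## [1] 0.1888889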

9F.

oj.cv = cv.tree(oj.tree, FUN = prune.tree)
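
Note that FUN = prune.tree guides the cross-validation by deviance. To have the pruning sequence selected by classification error rate instead, prune.misclass could be used (an alternative call, not what was run above):

oj.cv.misclass = cv.tree(oj.tree, FUN = prune.misclass)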

9G.

plot(oj.cv$size, oj.cv$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")

9H. A tree size of 8 gives the lowest cross-validated deviance.

9I-K.

oj.prun = prune.tree(oj.tree, best = 8)
summary(oj.prun)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"  
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7392 = 585.5 / 792 
## Misclassification error rate: 0.1638 = 131 / 800

The training error rate is 0.164, the same as for the unpruned tree: pruning with best = 8 returns the full tree, since the unpruned tree already has 8 terminal nodes.

pred.unprun = predict(oj.tree, test, type = "class")
mis.unprun = sum(test$Purchase != pred.unprun)
mis.unprun/length(pred.unprun)
## [1] 0.1888889
pred.prun = predict(oj.prun, test, type = "class")
mis.prun = sum(test$Purchase != pred.prun)
mis.prun/length(pred.prun)
## [1] 0.1888889

The pruned and unpruned trees have the same test error rate of 0.189, as expected, since the pruned tree is identical to the unpruned one.