##Q3 Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of \(\hat{p}_{m1}\). The x-axis should display \(\hat{p}_{m1}\), ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. Hint: In a setting with two classes, \(\hat{p}_{m1} = 1 - \hat{p}_{m2}\). You could make this plot by hand, but it will be much easier to make in R.
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
# entropy is NaN at p = 0 and p = 1 because of the 0 * log(0) terms; those two points are simply dropped from the plot
cross.entropy <- -(p * log(p) + (1 - p) * log(1 - p))
par(bg = "maroon")
matplot(p, cbind(gini.index, class.error, cross.entropy), type = 'b', pch = c(15, 17, 19), col = c("darkolivegreen4", "wheat", "tomato2"), ylab = "gini.index, class.error, cross.entropy")
legend('bottom', inset = .01, legend = c('gini.index', 'class.error', 'cross.entropy'), col = c("darkolivegreen4", "wheat", "tomato2"), pch = c(15, 17, 19))
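If the NaN values at the endpoints are a nuisance, one way to avoid them (a minimal sketch; safe.entropy is just an illustrative helper name, not part of the assignment) is to define the 0 * log(0) terms as 0 before plotting:
safe.entropy <- function(p) {
  term <- function(x) ifelse(x == 0, 0, -x * log(x))  # treat 0 * log(0) as 0
  term(p) + term(1 - p)
}
cross.entropy <- safe.entropy(p)  # now defined on the closed interval [0, 1]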
##Q8 In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
library(tree)
## Warning: package 'tree' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(24)
c <- Carseats  # note: 'c' masks the base function c() as a data object; calls to c() as a function still resolve correctly
train <- sample(1:nrow(c), nrow(c)/2)
c.tr <- c[train, ]
c.te <- c[-train, ]
tree.c <- tree(Sales ~ ., data = c.tr)
plot(tree.c)
text(tree.c, pretty=0)
yhat.tree=predict(tree.c, c.te)
obs.sales<-c.te$Sales
mean((yhat.tree-obs.sales)^2)
## [1] 4.541017
cv.c=cv.tree(tree.c)
#Let's find the lowest $dev value here
cv.c
## $size
## [1] 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 1012.989 1041.573 1020.262 1043.493 1057.997 1064.050 1131.822 1130.429
## [9] 1211.419 1182.640 1250.044 1239.531 1352.795 1438.036 1382.154 1686.631
##
## $k
## [1] -Inf 17.46067 18.21007 25.43324 29.77121 32.67272 44.19972
## [8] 46.09869 49.55016 50.87585 60.04473 63.14420 99.94859 132.15272
## [15] 143.22869 395.57046
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.c$size,cv.c$dev, xlab = 'size', ylab = 'cv error ')
abline(h = min(cv.c$dev) + .2*sd(cv.c$dev))
We can see in the chart that the lowest CV error occurs at a tree size of 16. However, a couple of smaller tree sizes come very close. Let's see which pruned tree size gives the lowest test MSE.
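As a quick programmatic check of the chart (using only the cv.c object already computed), the size at the minimum can be pulled out directly:
cv.c$size[which.min(cv.c$dev)]  # tree size with the lowest cross-validated deviance (16 here)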
prune.c <- list()
yhat.prune <- list()
mse.prune <- list()
for (i in 2:16) {
  prune.c[[i]] <- prune.tree(tree.c, best = i)
  yhat.prune[[i]] <- predict(prune.c[[i]], c.te)
  obs.sales <- c.te$Sales
  mse.prune[[i]] <- mean((yhat.prune[[i]] - obs.sales)^2)
}
mse.mat<-as.matrix(mse.prune)
mse.df<-as.data.frame(mse.mat)
par(bg = "pink" )
matplot(2:16, mse.df[-1,1], xlab = 'Size', ylab = 'MSE', main = "lowest MSE by size")
lines(2:16, mse.df[-1,1], type = "o")
mse.df <- data.frame(size=2:16,lapply(mse.df, unlist) )
mse.df$mse.rank<-rank(mse.df$V1)
names(mse.df)[2]<-'mse'
mse.df[mse.df$mse.rank %in% 1:3,]
## size mse mse.rank
## 13 14 4.727879 3
## 14 15 4.499964 1
## 15 16 4.541017 2
We can see from the ranking that the tree with 15 terminal nodes gives the lowest test MSE, 4.499964.
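To make use of that result, the 15-node tree can be re-pruned directly from the full tree (a short sketch, reusing prune.tree from above):
prune.best <- prune.tree(tree.c, best = 15)  # prune the full tree back to 15 terminal nodes
plot(prune.best)
text(prune.best, pretty = 0)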
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(24)
bag.c<-randomForest(Sales~., data = c.tr, mtry = 10, importance = TRUE)
yhat.bag<-predict(bag.c, newdata = c.te)
mean((yhat.bag - c.te$Sales)^2)
## [1] 2.692938
importance(bag.c)
## %IncMSE IncNodePurity
## CompPrice 22.8321261 176.578526
## Income 4.1064747 77.183648
## Advertising 12.8743758 99.953925
## Population 0.3779757 76.359264
## Price 51.4435675 465.155918
## ShelveLoc 51.6714645 438.596460
## Age 21.2202209 201.116544
## Education 1.3558349 36.305982
## Urban -2.0566741 6.577210
## US 0.6451619 5.857904
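The same importance information can also be shown graphically with randomForest's built-in plot (optional, but handy):
varImpPlot(bag.c, main = "Variable importance, bagging (mtry = 10)")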
Let's try mtry = 1:10 and compare the test MSE for each value.
rf.c <- list()
yhat.rf <- list()
mse.rf <- list()
for (i in 1:10) {
  set.seed(24)
  rf.c[[i]] <- randomForest(Sales ~ ., data = c.tr, mtry = i, importance = TRUE)
  yhat.rf[[i]] <- predict(rf.c[[i]], newdata = c.te)
  mse.rf[[i]] <- mean((yhat.rf[[i]] - c.te$Sales)^2)
}
par(bg = "green" )
matplot(1:10, mse.rf, xlab = 'mtry', ylab = 'MSE', main = "Lowest MSE by Predictors")
lines(1:10, mse.rf, type = "o")
mseRF<-t(as.data.frame(lapply(mse.rf, unlist)))
mse.rf<-data.frame(size=1:10,mseRF)
mse.rf.rownames<-paste0("mtry", 1:10)
rownames(mse.rf)<-mse.rf.rownames
mse.rf$mse.rank<-rank(mse.rf$mseRF)
mse.rf[order(mse.rf$mse.rank),]
## size mseRF mse.rank
## mtry10 10 2.692938 1
## mtry8 8 2.704865 2
## mtry9 9 2.722630 3
## mtry7 7 2.759607 4
## mtry6 6 2.843481 5
## mtry5 5 2.897282 6
## mtry4 4 2.970406 7
## mtry3 3 3.204131 8
## mtry2 2 3.573238 9
## mtry1 1 4.770207 10
In the table above, mtry = 10 (i.e., bagging, since all 10 predictors are considered at every split) gives the lowest test MSE, 2.692938.
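As a sanity check that doesn't touch the test set, the out-of-bag error stored in each regression forest (the mse component, one value per tree) can be compared across mtry as well; a short sketch using the rf.c list from the loop above:
oob.mse <- sapply(rf.c, function(fit) tail(fit$mse, 1))  # OOB MSE at the final (default 500th) tree of each forest
plot(1:10, oob.mse, type = "o", xlab = "mtry", ylab = "OOB MSE", main = "OOB MSE by mtry")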
##Q9 This problem involves the OJ data set, which is part of the ISLR package.
set.seed(10)
library(ISLR)
library(tree)
sam<-sample(1:nrow(OJ), 800)
oj.tr<-OJ[sam,]
oj.te<-OJ[-sam,]
tree.oj<-tree(Purchase~., data = oj.tr)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj.tr)
## Variables actually used in tree construction:
## [1] "LoyalCH" "DiscMM" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7983 = 633 / 793
## Misclassification error rate: 0.1775 = 142 / 800
The training misclassification error rate is \(17.75\%\), the tree has 7 terminal nodes, and only 3 of the 17 predictors are actually used in the splits: LoyalCH, DiscMM, and PriceDiff.
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1067.000 CH ( 0.61375 0.38625 )
## 2) LoyalCH < 0.48285 290 315.900 MM ( 0.23448 0.76552 )
## 4) LoyalCH < 0.035047 51 9.844 MM ( 0.01961 0.98039 ) *
## 5) LoyalCH > 0.035047 239 283.600 MM ( 0.28033 0.71967 )
## 10) DiscMM < 0.47 220 270.500 MM ( 0.30455 0.69545 ) *
## 11) DiscMM > 0.47 19 0.000 MM ( 0.00000 1.00000 ) *
## 3) LoyalCH > 0.48285 510 466.000 CH ( 0.82941 0.17059 )
## 6) LoyalCH < 0.764572 245 300.200 CH ( 0.69796 0.30204 )
## 12) PriceDiff < 0.145 99 137.000 MM ( 0.47475 0.52525 )
## 24) DiscMM < 0.47 82 112.900 CH ( 0.54878 0.45122 ) *
## 25) DiscMM > 0.47 17 12.320 MM ( 0.11765 0.88235 ) *
## 13) PriceDiff > 0.145 146 123.800 CH ( 0.84932 0.15068 ) *
## 7) LoyalCH > 0.764572 265 103.700 CH ( 0.95094 0.04906 ) *
par(bg = "violet")
plot(tree.oj)
text(tree.oj, pretty = 0)
tree.preds<-predict(tree.oj, oj.te, type = 'class')
obs.purch<-oj.te$Purchase
caret::confusionMatrix(tree.preds, obs.purch)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 135 20
## MM 27 88
##
## Accuracy : 0.8259
## 95% CI : (0.7753, 0.8692)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 9.992e-16
##
## Kappa : 0.6412
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8333
## Specificity : 0.8148
## Pos Pred Value : 0.8710
## Neg Pred Value : 0.7652
## Prevalence : 0.6000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5741
## Balanced Accuracy : 0.8241
##
## 'Positive' Class : CH
##
We get a test error rate of 1 - 0.8259 = 17.41% (47 of the 270 test observations are misclassified).
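The same number can be computed directly from the predictions, without going through caret:
mean(tree.preds != obs.purch)  # test misclassification rate, approximately 0.1741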
cv.oj<-cv.tree(tree.oj, FUN = prune.misclass)
par(bg = "yellow")
plot(cv.oj$size,cv.oj$dev/800, type ="b", xlab = "Size", ylab = 'CV Error Rate', main = 'Lowest CV Error Rate by Size')
best.trees<-data.frame(tree_size = cv.oj$size, CvErrors = cv.oj$dev, Rate = paste0(cv.oj$dev/8,"%"))
best.trees[order(best.trees$Rate),]
## tree_size CvErrors Rate
## 1 7 157 19.625%
## 2 5 157 19.625%
## 3 2 161 20.125%
## 4 1 309 38.625%
We can see in the table above that tree sizes of 7 and 5 both give the lowest CV error rate, 19.625%.
prune.oj<-prune.misclass(tree.oj, best = 5)
prune.predz<-predict(prune.oj, oj.te, type = "class")
caret::confusionMatrix(prune.predz, obs.purch)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 135 20
## MM 27 88
##
## Accuracy : 0.8259
## 95% CI : (0.7753, 0.8692)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 9.992e-16
##
## Kappa : 0.6412
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8333
## Specificity : 0.8148
## Pos Pred Value : 0.8710
## Neg Pred Value : 0.7652
## Prevalence : 0.6000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5741
## Balanced Accuracy : 0.8241
##
## 'Positive' Class : CH
##
best = 5: 17.41%; best = 7: 17.41%. The trees with 5 and 7 terminal nodes have the same test error rate, which is the lowest among the sizes considered, so pruning to 5 nodes gives a simpler tree with no loss in accuracy.
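For completeness, the same calculation for best = 7 (here the unpruned tree already has 7 terminal nodes, so this should just reproduce the 17.41% quoted above):
prune.oj7 <- prune.misclass(tree.oj, best = 7)
prune.predz7 <- predict(prune.oj7, oj.te, type = "class")
mean(prune.predz7 != obs.purch)  # test error rate for the 7-node tree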