### Consider the Gini index, classification error, and entropy in a simple
### classification setting with two classes. Create a single plot that displays
### each of these quantities as a function of $\hat{p}_{m1}$. The x-axis should
### display $\hat{p}_{m1}$, ranging from 0 to 1, and the y-axis should display
### the value of the Gini index, classification error, and entropy.
### Hint: In a setting with two classes, $\hat{p}_{m1} = 1 - \hat{p}_{m2}$.
### You could make this plot by hand, but it will be much easier to make in R.
# Evaluate three node-impurity measures for a two-class problem on a grid of
# class-1 proportions p (the class-2 proportion is 1 - p).
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
# One x * log(x) term of the entropy sum, using the limit value 0 at x == 0.
# Evaluating 0 * log(0) directly yields NaN, which would drop the entropy
# curve's endpoints at p = 0 and p = 1 from the plot.
entropy.term <- function(x) ifelse(x > 0, x * log(x), 0)
cross.entropy <- -(entropy.term(p) + entropy.term(1 - p))
# Colours and plotting symbols are shared by the curves and the legend so the
# two cannot drift apart.
impurity.cols <- c("gray67", "lightpink2", "goldenrod2")
impurity.pch <- c(15, 17, 19)
par(bg = "lightcyan")
matplot(p, cbind(gini.index, class.error, cross.entropy),
        pch = impurity.pch,
        ylab = "gini.index, class.error, cross.entropy",
        col = impurity.cols, type = "b")
legend("bottom", inset = .01,
       legend = c("gini.index", "class.error", "cross.entropy"),
       col = impurity.cols, pch = impurity.pch)
### This problem involves the OJ data set, which is part of the ISLR package.
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(10)
library(ISLR)
library(tree)
# Draw 800 row indices of OJ for training; the remaining rows form the test set.
samp <- sample(seq_len(nrow(OJ)), size = 800)
oj.tr <- OJ[samp, ]
oj.te <- OJ[-samp, ]
# Fit a tree for Purchase using every other column as a candidate predictor,
# then print its summary.
tree.oj <- tree(Purchase ~ ., data = oj.tr)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj.tr)
## Variables actually used in tree construction:
## [1] "LoyalCH" "DiscMM" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7983 = 633 / 793
## Misclassification error rate: 0.1775 = 142 / 800
# Print the fitted tree node by node (split, n, deviance, predicted class and
# class probabilities; terminal nodes are marked with *).
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1067.000 CH ( 0.61375 0.38625 )
## 2) LoyalCH < 0.48285 290 315.900 MM ( 0.23448 0.76552 )
## 4) LoyalCH < 0.035047 51 9.844 MM ( 0.01961 0.98039 ) *
## 5) LoyalCH > 0.035047 239 283.600 MM ( 0.28033 0.71967 )
## 10) DiscMM < 0.47 220 270.500 MM ( 0.30455 0.69545 ) *
## 11) DiscMM > 0.47 19 0.000 MM ( 0.00000 1.00000 ) *
## 3) LoyalCH > 0.48285 510 466.000 CH ( 0.82941 0.17059 )
## 6) LoyalCH < 0.764572 245 300.200 CH ( 0.69796 0.30204 )
## 12) PriceDiff < 0.145 99 137.000 MM ( 0.47475 0.52525 )
## 24) DiscMM < 0.47 82 112.900 CH ( 0.54878 0.45122 ) *
## 25) DiscMM > 0.47 17 12.320 MM ( 0.11765 0.88235 ) *
## 13) PriceDiff > 0.145 146 123.800 CH ( 0.84932 0.15068 ) *
## 7) LoyalCH > 0.764572 265 103.700 CH ( 0.95094 0.04906 ) *
# Draw the unpruned tree on a pink background and label its splits.
par(bg = "lightpink")
plot(tree.oj)
text(tree.oj, pretty = 0)
# Compare the predicted classes for the test rows with the observed purchases.
obs.purch <- oj.te[["Purchase"]]
tree.predz <- predict(tree.oj, newdata = oj.te, type = "class")
caret::confusionMatrix(data = tree.predz, reference = obs.purch)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 135 20
## MM 27 88
##
## Accuracy : 0.8259
## 95% CI : (0.7753, 0.8692)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 9.992e-16
##
## Kappa : 0.6412
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8333
## Specificity : 0.8148
## Pos Pred Value : 0.8710
## Neg Pred Value : 0.7652
## Prevalence : 0.6000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5741
## Balanced Accuracy : 0.8241
##
## 'Positive' Class : CH
##
# Cross-validate the tree, with prune.misclass as the pruning function.
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
# Error rates are the CV error counts divided by the training-set size; the
# size is read from the data rather than hard-coded so it follows the split.
n.train <- nrow(oj.tr)
plot(cv.oj$size, cv.oj$dev / n.train, type = "b",
     xlab = "tree size", ylab = "CV Error Rate",
     main = "What size tree has\n the lowest CV Error Rate?")
best.trees <- data.frame(tree_size = cv.oj$size,
                         CvErrors = cv.oj$dev,
                         Rate = paste0(100 * cv.oj$dev / n.train, "%"))
# Sort on the numeric error count. Rate is text, so ordering by it compares
# character by character (e.g. "9.5%" would sort after "19.625%").
best.trees[order(best.trees$CvErrors), ]
## tree_size CvErrors Rate
## 1 7 157 19.625%
## 2 5 157 19.625%
## 3 2 161 20.125%
## 4 1 309 38.625%
# Prune the tree back to 5 terminal nodes and score it on the same test rows.
prune.oj <- prune.misclass(tree = tree.oj, best = 5)
prune.predz <- predict(prune.oj, newdata = oj.te, type = "class")
caret::confusionMatrix(data = prune.predz, reference = obs.purch)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 135 20
## MM 27 88
##
## Accuracy : 0.8259
## 95% CI : (0.7753, 0.8692)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 9.992e-16
##
## Kappa : 0.6412
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8333
## Specificity : 0.8148
## Pos Pred Value : 0.8710
## Neg Pred Value : 0.7652
## Prevalence : 0.6000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5741
## Balanced Accuracy : 0.8241
##
## 'Positive' Class : CH
##