Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of $\hat{p}_{m1}$. The x-axis should display $\hat{p}_{m1}$, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. Hint: in a setting with two classes, $\hat{p}_{m1} = 1 - \hat{p}_{m2}$. You could make this plot by hand, but it will be much easier to make in R.
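Writing $p = \hat{p}_{m1}$ (so that $\hat{p}_{m2} = 1 - p$), the three quantities reduce to their standard two-class forms, which are exactly what the code below computes:

$$G = 2p(1 - p), \qquad E = 1 - \max(p,\ 1 - p), \qquad D = -p \log p - (1 - p)\log(1 - p).$$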
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
c.error <- 1 - pmax(p, 1 - p)
# entropy is NaN at p = 0 and p = 1 (0 * log(0)); those endpoints are simply not drawn
c.entropy <- -(p * log(p) + (1 - p) * log(1 - p))
# plot all three against p so the x-axis runs from 0 to 1 as required
plot(p, c.error, type = "l", lwd = 2, ylim = c(0, 1), col = "#FF1493",
     xlab = expression(hat(p)[m1]), ylab = "Gini index, classification error, entropy")
lines(p, gini.index, lwd = 2, col = "#9932CC")
lines(p, c.entropy, lwd = 2, col = "#1E90FF")
legend("top", inset = .01,
       legend = c("Gini index", "Classification error", "Entropy"),
       col = c("#9932CC", "#FF1493", "#1E90FF"), lwd = 2)
library(ISLR)
data(Carseats)
sample_size <- floor(0.70 * nrow(Carseats))
set.seed(1)
train_index <- sample(seq_len(nrow(Carseats)), size = sample_size)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
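As a quick sanity check on the split (Carseats has 400 observations, so a 70/30 split should give 280 training and 120 test rows):

nrow(train)  # expected 280
nrow(test)   # expected 120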
library(tree)
set.seed(1)
tree.fit <- tree(Sales ~ ., data = train)
summary(tree.fit)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Income" "CompPrice"
## [6] "Advertising"
## Number of terminal nodes: 18
## Residual mean deviance: 2.409 = 631.1 / 262
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.77800 -0.96100 -0.08865 0.00000 1.01800 4.14100
set.seed(1)
tree.pred <- predict(tree.fit, newdata = test)
mean((tree.pred - test$Sales)^2)
## [1] 4.208383
plot(tree.fit)
text(tree.fit, pretty = 0, cex = 0.8)
set.seed(1)
tree.cv = cv.tree(tree.fit)
plot(tree.cv$size, tree.cv$dev, type = "b")
tree.min <- which.min(tree.cv$dev)
points(tree.min, tree.cv$dev[tree.min], cex = 2, col = "red", pch = 20)
set.seed(1)
pruned.tree <- prune.tree(tree.fit, best = 11)
pruned.pred <- predict(pruned.tree, newdata = test)
mean((pruned.pred - test$Sales)^2)
## [1] 4.691032
Pruning to 11 terminal nodes did not improve the test MSE: it rose from 4.21 for the full tree to 4.69 for the pruned tree.
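For comparison, the tree could also be pruned at the size chosen by cross-validation rather than at a fixed value of 11; a minimal sketch reusing tree.cv, tree.fit, and test from above (best.size and pruned.cv are illustrative names):

# prune to the size with the lowest cross-validated deviance, then recompute the test MSE
best.size <- tree.cv$size[which.min(tree.cv$dev)]
pruned.cv <- prune.tree(tree.fit, best = best.size)
mean((predict(pruned.cv, newdata = test) - test$Sales)^2)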
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.car <- randomForest(Sales ~ ., data = train, mtry = 10, importance = TRUE)  # mtry = 10 uses all 10 predictors, i.e. bagging
bag.pred <- predict(bag.car, newdata = test)
mean((bag.pred - test$Sales)^2)
## [1] 2.573252
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 35.324343 230.55353
## Income 7.923790 118.79213
## Advertising 19.736816 155.03099
## Population -3.479681 63.11975
## Price 70.613500 673.11982
## ShelveLoc 68.147204 637.69500
## Age 20.964215 228.90345
## Education 4.705263 63.13124
## Urban -2.098091 11.66651
## US 1.389570 11.15329
The bagging test MSE is 2.57; the most important variables are Price, ShelveLoc, and CompPrice.
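The importance scores can also be displayed graphically with the randomForest package's varImpPlot():

# dot charts of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag.car)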
set.seed(1)
random.car5 = randomForest(Sales ~ ., data = train, mtry = 5, ntree = 500, importance = T)
random.pred.car5 = predict(random.car5, test)
mean((test$Sales - random.pred.car5)^2)
## [1] 2.60463
random.car6 = randomForest(Sales ~ ., data = train, mtry = 6, ntree = 500, importance = T)
random.pred.car6 = predict(random.car6, test)
mean((test$Sales - random.pred.car6)^2)
## [1] 2.551024
random.car7 = randomForest(Sales ~ ., data = train, mtry = 7, ntree = 500, importance = T)
random.pred.car7 = predict(random.car7, test)
mean((test$Sales - random.pred.car7)^2)
## [1] 2.565275
random.car8 = randomForest(Sales ~ ., data = train, mtry = 8, ntree = 500, importance = T)
random.pred.car8 = predict(random.car8, test)
mean((test$Sales - random.pred.car8)^2)
## [1] 2.567522
random.car9 = randomForest(Sales ~ ., data = train, mtry = 9, ntree = 500, importance = T)
random.pred.car9 = predict(random.car9, test)
mean((test$Sales - random.pred.car9)^2)
## [1] 2.609295
random.car10 = randomForest(Sales ~ ., data = train, mtry = 10, ntree = 500, importance = T)
random.pred.car10 = predict(random.car10, test)
mean((test$Sales - random.pred.car10)^2)
## [1] 2.598445
random.car1 = randomForest(Sales ~ ., data = train, mtry = 1, ntree = 500, importance = T)
random.pred.car1 = predict(random.car1, test)
mean((test$Sales - random.pred.car1)^2)
## [1] 4.557115
random.car2 = randomForest(Sales ~ ., data = train, mtry = 2, ntree = 500, importance = T)
random.pred.car2 = predict(random.car2, test)
mean((test$Sales - random.pred.car2)^2)
## [1] 3.315914
random.car3 = randomForest(Sales ~ ., data = train, mtry = 3, ntree = 500, importance = T)
random.pred.car3 = predict(random.car3, test)
mean((test$Sales - random.pred.car3)^2)
## [1] 2.894646
random.car4 = randomForest(Sales ~ ., data = train, mtry = 4, ntree = 500, importance = T)
random.pred.car4 = predict(random.car4, test)
mean((test$Sales - random.pred.car4)^2)
## [1] 2.672202
Of the values tried, mtry = 6 gives the lowest test MSE (2.55).
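The same search over mtry could be condensed into a single loop; a sketch reusing train and test from above (mtry.mse is an illustrative name, and the exact MSE values depend on the random seed):

# fit a 500-tree forest for each mtry from 1 to 10 and record the test MSE
set.seed(1)
mtry.mse <- sapply(1:10, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m, ntree = 500)
  mean((predict(fit, test) - test$Sales)^2)
})
plot(1:10, mtry.mse, type = "b", xlab = "mtry", ylab = "Test MSE")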
importance(random.car6)
## %IncMSE IncNodePurity
## CompPrice 27.143517 219.92968
## Income 4.921334 130.21447
## Advertising 17.294698 155.03223
## Population -1.667603 85.59735
## Price 60.661862 646.00362
## ShelveLoc 63.328288 590.63510
## Age 22.179583 252.02070
## Education 1.416713 74.92792
## Urban -1.338920 13.06650
## US 4.552794 20.63057
The three most important variables are Price, ShelveLoc, and CompPrice, the same as for the bagged model.
library(ISLR)
data(OJ)
set.seed(1)
sample_oj <- sample(nrow(OJ), 800)
train.oj <- OJ[sample_oj, ]
test.oj <- OJ[-sample_oj, ]
library(tree)
set.seed(1)
tree.fit.oj <- tree(Purchase ~ ., data = train.oj)
summary(tree.fit.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.oj)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
The training misclassification error rate is 15.88% and the tree has 9 terminal nodes. The variables actually used in tree construction are LoyalCH, PriceDiff, SpecialCH, ListPriceDiff, and PctDiscMM.
tree.fit.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196197 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196197 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
Consider terminal node 25, the branch where PctDiscMM > 0.196197 (within the region where 0.5036 < LoyalCH < 0.764572 and ListPriceDiff < 0.235). The leaf contains 17 observations with a deviance of 12.32. The predicted class is MM: about 88.2% of the observations in this node purchased MM and about 11.8% purchased CH.
plot(tree.fit.oj)
text(tree.fit.oj, pretty = 0, cex = 0.8)
set.seed(1)
tree.pred.oj <- predict(tree.fit.oj, test.oj, type = 'class')
table(test.oj$Purchase, tree.pred.oj)
## tree.pred.oj
## CH MM
## CH 160 8
## MM 38 64
caret::confusionMatrix(tree.pred.oj, test.oj$Purchase)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 160 38
## MM 8 64
##
## Accuracy : 0.8296
## 95% CI : (0.7794, 0.8725)
## No Information Rate : 0.6222
## P-Value [Acc > NIR] : 8.077e-14
##
## Kappa : 0.6154
##
## Mcnemar's Test P-Value : 1.904e-05
##
## Sensitivity : 0.9524
## Specificity : 0.6275
## Pos Pred Value : 0.8081
## Neg Pred Value : 0.8889
## Prevalence : 0.6222
## Detection Rate : 0.5926
## Detection Prevalence : 0.7333
## Balanced Accuracy : 0.7899
##
## 'Positive' Class : CH
##
The test error rate is (8 + 38) / (160 + 64 + 8 + 38) = 46 / 270 ≈ 17.04%.
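The same rate can be computed directly from the stored predictions:

# test misclassification rate for the unpruned tree
mean(tree.pred.oj != test.oj$Purchase)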
set.seed(1)
oj.tree.cv = cv.tree(tree.fit.oj, FUN = prune.misclass)
oj.tree.cv
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 145 145 146 146 167 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(oj.tree.cv$size, oj.tree.cv$dev, type="b")
Tree sizes of 8 and 9 terminal nodes give the lowest cross-validation error (145 misclassifications out of 800).
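The optimal size can also be extracted programmatically from oj.tree.cv:

# tree size with the lowest cross-validated misclassification count
oj.tree.cv$size[which.min(oj.tree.cv$dev)]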
set.seed(1)
pruned.fit.oj = prune.misclass(tree.fit.oj, best=4)
summary(pruned.fit.oj)
##
## Classification tree:
## snip.tree(tree = tree.fit.oj, nodes = c(4L, 10L, 3L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 4
## Residual mean deviance: 0.8922 = 710.2 / 796
## Misclassification error rate: 0.1788 = 143 / 800
The pruned tree's training error rate is 17.88%, higher than the unpruned tree's 15.88%.
set.seed(1)
pruned.pred.oj <- predict(pruned.fit.oj, test.oj, type = 'class')
caret::confusionMatrix(pruned.pred.oj, test.oj$Purchase)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 161 41
## MM 7 61
##
## Accuracy : 0.8222
## 95% CI : (0.7713, 0.8659)
## No Information Rate : 0.6222
## P-Value [Acc > NIR] : 6.769e-13
##
## Kappa : 0.5954
##
## Mcnemar's Test P-Value : 1.906e-06
##
## Sensitivity : 0.9583
## Specificity : 0.5980
## Pos Pred Value : 0.7970
## Neg Pred Value : 0.8971
## Prevalence : 0.6222
## Detection Rate : 0.5963
## Detection Prevalence : 0.7481
## Balanced Accuracy : 0.7782
##
## 'Positive' Class : CH
##
The pruned tree has a test error rate of (41 + 7) / (41 + 7 + 161 + 61) = 48 / 270 ≈ 17.78%, slightly higher than the unpruned tree's test error rate of 46 / 270 ≈ 17.04%.
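Both rates can be verified directly from the stored predictions:

# direct test error rates for the unpruned and pruned OJ trees
mean(tree.pred.oj != test.oj$Purchase)    # unpruned
mean(pruned.pred.oj != test.oj$Purchase)  # pruned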