p = seq(0, 1, 0.01)
gini = p * (1 - p) * 2
entropy = -(p * log(p) + (1 - p) * log(1 - p))
class.err = 1 - pmax(p, 1 - p)
# draw the three impurity curves as lines and label them
matplot(p, cbind(gini, entropy, class.err), type = "l", lty = 1, lwd = 2,
        col = c("pink", "red", "purple"), xlab = "p", ylab = "Impurity")
legend("topright", legend = c("Gini index", "Cross-entropy", "Classification error"),
       col = c("pink", "red", "purple"), lty = 1, lwd = 2)
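All three measures peak at p = 0.5; as a quick sanity check on the curves, the exact values there are Gini = 0.5, cross-entropy = log(2) ≈ 0.693, and classification error = 0.5:
p0 = 0.5
c(gini    = 2 * p0 * (1 - p0),                        # 0.5
  entropy = -(p0 * log(p0) + (1 - p0) * log(1 - p0)), # log(2) ~ 0.693
  class   = 1 - max(p0, 1 - p0))                      # 0.5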
library(ISLR)    # Carseats data set
library(caret)   # createDataPartition(), train()
library(rpart)   # recursive partitioning trees
data(Carseats)
## Converting Sales into a Qualitative Response Variable
# Yes if Sales > 8 and No otherwise
Carseats$Sales=as.factor(ifelse(Carseats$Sales > 8,"Yes","No"))
inTrain = createDataPartition(y=Carseats$Sales,p=0.75,list = FALSE)
train = Carseats[inTrain,]
test = Carseats[-inTrain,]
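The mse() helper used in the evaluations below is not defined in this report; it is assumed to come from a metrics package (e.g. Metrics::mse) or an equivalent one-liner such as this sketch, which compares factors on their underlying integer codes:
# hypothetical helper: mean squared error; for a Yes/No factor versus class
# predictions this reduces to the misclassification rate
mse = function(actual, predicted) mean((as.numeric(actual) - as.numeric(predicted))^2)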
# fit a tree to the binary Sales indicator using rpart's anova method
tree.carseats=rpart(Sales ~., data=train, method="anova", control=rpart.control(minsplit=15, cp=0.01))
library(rattle)
fancyRpartPlot(tree.carseats, main = "Tree Plot of Carseats", sub="", cex=0.5, palettes=c("Blues"))
The most important indicator of Sales appears to be shelving location (ShelveLoc), since the first split separates Good locations from Bad and Medium ones.
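This can be cross-checked against rpart's split-based importance scores, which sum the improvement attributed to each variable over all splits (assuming the fitted tree.carseats object is still in memory):
tree.carseats$variable.importance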
pred1=predict(tree.carseats, newdata=test)
TestMSE1=mse(test$Sales,pred1)
TestMSE1
## [1] 0.2144515
printcp(tree.carseats)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = train, method = "anova", control = rpart.control(minsplit = 15,
## cp = 0.01))
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Price ShelveLoc
## [7] US
##
## Root node error: 72.57/300 = 0.2419
##
## n= 300
##
## CP nsplit rel error xerror xstd
## 1 0.151618 0 1.00000 1.01014 0.021501
## 2 0.081840 1 0.84838 0.86950 0.047181
## 3 0.071033 2 0.76654 0.90486 0.056551
## 4 0.051328 3 0.69551 0.87484 0.061682
## 5 0.034360 4 0.64418 0.82117 0.065181
## 6 0.033913 5 0.60982 0.84569 0.068591
## 7 0.029488 8 0.50808 0.81937 0.068484
## 8 0.027188 9 0.47859 0.81004 0.068948
## 9 0.024043 10 0.45141 0.80325 0.071034
## 10 0.023488 12 0.40332 0.79430 0.072405
## 11 0.018603 13 0.37983 0.78512 0.071709
## 12 0.011024 14 0.36123 0.77119 0.073577
## 13 0.010000 15 0.35021 0.77862 0.073729
plotcp(tree.carseats, col = 4, pch=16, cex=1)
tree.carseats$cptable[which.min(tree.carseats$cptable[,"xerror"]),"xerror"]
## [1] 0.7711895
Based on the cp table, the lowest cross-validated error (xerror = 0.771) occurs at cp ≈ 0.011 with 14 splits, i.e. a tree with 15 terminal nodes.
tree.carseats$cptable[which.min(tree.carseats$cptable[,"xerror"]),"CP"]
## [1] 0.01102384
carseats.prune=prune(tree.carseats,cp=tree.carseats$cptable[which.min(tree.carseats$cptable[,"xerror"]),"CP"])
fancyRpartPlot(carseats.prune, uniform=TRUE, main="Pruned Tree", sub="", palettes=c("Oranges"))
pred2=predict(carseats.prune, newdata=test)
TestMSE2=mse(test$Sales,pred2)
TestMSE2
## [1] 0.2168515
Pruning at this cp removes only one split, and the test MSE actually rises slightly (from 0.2145 to 0.2169), so pruning did not improve test performance here.
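A common alternative to picking the minimum-xerror cp is the 1-SE rule: take the smallest tree whose cross-validated error is within one standard error of the minimum. A minimal sketch, assuming the fitted tree.carseats from above (carseats.prune.1se is a hypothetical name):
cp.tab = tree.carseats$cptable
min.row = which.min(cp.tab[, "xerror"])
# threshold = minimum xerror plus one standard error
thresh = cp.tab[min.row, "xerror"] + cp.tab[min.row, "xstd"]
# smallest tree (fewest splits) whose xerror falls under the threshold
se.row = min(which(cp.tab[, "xerror"] <= thresh))
carseats.prune.1se = prune(tree.carseats, cp = cp.tab[se.row, "CP"])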
library(randomForest)
# bagging: consider all 10 predictors at every split (mtry = p)
Carseats.bag = randomForest(Sales~.,data=train,mtry = 10,importance = TRUE)
pred3 = predict(Carseats.bag,newdata=test)
TestMSE3 = mse(test$Sales, pred3)
TestMSE3
## [1] 0.25
# important variables
importance(Carseats.bag)
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## CompPrice 7.8394961 10.2935689 12.83924013 16.2478329
## Income 3.3928095 2.7049537 4.14660222 11.6397918
## Advertising 14.5436419 16.7306440 21.61109016 20.1478430
## Population -0.9800967 -5.0741421 -4.01906273 8.1456945
## Price 31.1404945 33.7299992 43.33675112 38.6462901
## ShelveLoc 33.3149468 28.8125463 39.86600448 27.3276185
## Age 7.5351530 8.8450096 11.12405479 14.2695766
## Education 1.0633392 -1.0829264 0.08616897 5.2471448
## Urban 1.2487407 -0.6639458 0.41999398 0.7714795
## US 5.4930214 6.8601409 8.70186544 2.1589283
library(colorspace)
barplot(importance(Carseats.bag), beside = TRUE, col = rainbow_hcl(10),
        legend.text = rownames(importance(Carseats.bag)),
        args.legend = list(x = "topright"), axisnames = TRUE,
        border = TRUE, width = 1, xlim = c(3, 52))
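randomForest also ships its own dot-chart of the same scores; a one-line alternative view (assuming Carseats.bag is still in memory):
varImpPlot(Carseats.bag, main = "Bagged trees: variable importance")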
# fit random forest
Carseats.rf=train(Sales ~ .,data=train,method='rf',trControl = trainControl("cv", number = 10),importance = TRUE)
Carseats.rf
## Random Forest
##
## 300 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 270, 270, 270, 270, 270, 271, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8002929 0.5710437
## 6 0.8001928 0.5764407
## 11 0.7834112 0.5423412
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
By default, 500 trees are grown. Cross-validation selects mtry = 2 (accuracy 0.8003, marginally ahead of mtry = 6), so the final model samples m = 2 variables at each split.
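caret only evaluated three mtry values by default; a denser grid can be requested explicitly via tuneGrid. A sketch under the same resampling setup (Carseats.rf2 is a hypothetical name):
rf.grid = expand.grid(mtry = 1:10)   # try every possible number of candidate variables
Carseats.rf2 = train(Sales ~ ., data = train, method = 'rf',
                     trControl = trainControl("cv", number = 10),
                     tuneGrid = rf.grid, importance = TRUE)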
pred4=predict(Carseats.rf, newdata=test)
TestMSE4=mse(test$Sales,pred4)
TestMSE4
## [1] 0.21
# important variables
varImp(Carseats.rf)
## rf variable importance
##
## Importance
## ShelveLocGood 100.000
## Price 91.465
## Advertising 51.791
## ShelveLocMedium 32.042
## Age 25.728
## USYes 23.342
## CompPrice 14.377
## Education 12.188
## Income 10.536
## UrbanYes 8.718
## Population 0.000
plot(varImp(Carseats.rf))
The results indicate that, across all of the trees considered in the random forest, the price the company charges for car seats at each site (Price) and the quality of the shelving location (ShelveLoc) are by far the two most important variables.
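A partial-dependence plot shows how one of these variables acts, beyond its importance ranking; a sketch using randomForest::partialPlot on the bagged fit from above (assuming Carseats.bag and train are still in memory):
# partial dependence of the predicted probability of Sales = "Yes" on Price
partialPlot(Carseats.bag, pred.data = train, x.var = "Price", which.class = "Yes",
            main = "Partial dependence of high Sales on Price")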
library(ISLR)
data(OJ)
inTrain2 = sample(1070,800)
train2 = OJ[inTrain2,]
test2 = OJ[-inTrain2,]
# fit a tree
tree.OJ=rpart(Purchase ~., data=train2, method="class", control=rpart.control(minsplit=15, cp=0.01))
summary(tree.OJ, cp=1)
## Call:
## rpart(formula = Purchase ~ ., data = train2, method = "class",
## control = rpart.control(minsplit = 15, cp = 0.01))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.5096154 0 1.0000000 1.0000000 0.04421683
## 2 0.0224359 1 0.4903846 0.5128205 0.03626189
## 3 0.0100000 4 0.4230769 0.5064103 0.03609080
##
## Variable importance
## LoyalCH ListPriceDiff PriceDiff SalePriceMM PriceMM
## 60 10 6 5 5
## PriceCH SalePriceCH StoreID WeekofPurchase PctDiscMM
## 3 2 2 2 1
## DiscMM SpecialMM
## 1 1
##
## Node number 1: 800 observations
## predicted class=CH expected loss=0.39 P(node) =1
## class counts: 488 312
## probabilities: 0.610 0.390
The fitted tree has 4 splits and therefore 5 terminal nodes.
The most important indicator of Purchase appears to be customer brand loyalty for Citrus Hill (LoyalCH).
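The terminal-node count can also be read off programmatically, since rpart marks leaves with "<leaf>" in the frame component (assuming tree.OJ from above):
sum(tree.OJ$frame$var == "<leaf>")   # number of terminal nodes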
pred_mat1=predict(tree.OJ,train2, type = "class")
TrainErr = mean(pred_mat1 != train2$Purchase)
TrainErr
## [1] 0.165
tree.OJ
## n= 800
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 312 CH (0.61000000 0.39000000)
## 2) LoyalCH>=0.48285 497 81 CH (0.83702213 0.16297787)
## 4) LoyalCH>=0.740621 264 11 CH (0.95833333 0.04166667) *
## 5) LoyalCH< 0.740621 233 70 CH (0.69957082 0.30042918)
## 10) ListPriceDiff>=0.235 139 19 CH (0.86330935 0.13669065) *
## 11) ListPriceDiff< 0.235 94 43 MM (0.45744681 0.54255319)
## 22) PriceDiff>=0.015 43 15 CH (0.65116279 0.34883721) *
## 23) PriceDiff< 0.015 51 15 MM (0.29411765 0.70588235) *
## 3) LoyalCH< 0.48285 303 72 MM (0.23762376 0.76237624) *
library(rattle)
fancyRpartPlot(tree.OJ, main = "Tree Plot of Orange Juice Data", sub="", cex=0.6, palettes=c("Blues"))
The most important indicator of Purchase appears to be customer brand loyalty for Citrus Hill (LoyalCH), since the first split separates customers with LoyalCH >= 0.48 from those below that cutoff.
pred_mat2=predict(tree.OJ, test2, type = "class")
# confusion matrix
table(pred_mat2, test2$Purchase)
##
## pred_mat2 CH MM
## CH 139 21
## MM 26 84
TestErr = mean(pred_mat2 != test2$Purchase)
TestErr
## [1] 0.1740741
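caret's confusionMatrix() reports the same table along with accuracy, sensitivity, and specificity (assuming caret is loaded and pred_mat2/test2 are available):
confusionMatrix(pred_mat2, test2$Purchase)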
library(tree)
# 10-fold cross-validation; FUN = prune.tree prunes on deviance
tree.cv.OJ=cv.tree(tree(Purchase ~ ., data = train2), FUN = prune.tree, K = 10)
tree.cv.OJ
## $size
## [1] 7 6 5 4 3 2 1
##
## $dev
## [1] 681.6301 688.3083 693.0921 695.5025 728.5934 793.6519 1071.8759
##
## $k
## [1] -Inf 12.22040 13.79713 33.47481 44.30497 65.62322 295.80418
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(tree.cv.OJ$size, tree.cv.OJ$dev, type = "b", xlab = "Tree size", ylab = "Cross Validation Error", col="purple", pch=16)
tree.cv.OJ$dev[tree.cv.OJ$size == 5]
## [1] 693.0921
The cross-validated deviance keeps decreasing out to the full tree of size 7, but the gain beyond 5 terminal nodes is small, so a tree of size 5 is chosen as a more parsimonious pruning target.
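Since FUN = prune.tree cross-validates on deviance, the misclassification rate can be cross-validated directly with prune.misclass instead; a sketch under the same setup (tree.cv.OJ.mis is a hypothetical name):
tree.cv.OJ.mis = cv.tree(tree(Purchase ~ ., data = train2), FUN = prune.misclass, K = 10)
plot(tree.cv.OJ.mis$size, tree.cv.OJ.mis$dev, type = "b",
     xlab = "Tree size", ylab = "CV misclassification count", col = "purple", pch = 16)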
OJ.prune = prune.misclass(tree(Purchase ~ ., data = train2), best = 5)
plot(OJ.prune, main="Pruned Tree", col = "limegreen")
text(OJ.prune, pretty=0, col = "darkgreen")
pred_mat3=predict(OJ.prune,train2, type = "class")
TrainErr2 = mean(pred_mat3 != train2$Purchase)
TrainErr2
## [1] 0.165
TrainErr
## [1] 0.165
The training error rates of the pruned and unpruned trees are identical (0.165).
pred_mat4=predict(OJ.prune,test2, type = "class")
TestErr2 = mean(pred_mat4 != test2$Purchase)
TestErr2
## [1] 0.1740741
TestErr
## [1] 0.1740741
The test error rates are also identical (0.174); pruning to 5 terminal nodes yields a simpler tree with no loss of predictive accuracy on this split.
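For reference, the error rates computed above can be collected side by side (assuming the objects are still in the workspace):
data.frame(model = c("unpruned tree", "pruned tree (5 leaves)"),
           train.error = c(TrainErr, TrainErr2),
           test.error = c(TestErr, TestErr2))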