library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.1 v dplyr 0.8.5
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.2
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
inTrain<-createDataPartition(y=OJ$Purchase, p=800/nrow(OJ),list=FALSE)
train<-OJ[inTrain,]
test<-OJ[-inTrain,]
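Note that createDataPartition() samples at random, so calling set.seed() beforehand would make the split reproducible. With p = 800/1070 and caret's within-class rounding, the partition lands on 801 training rows and 269 test rows, which can be confirmed directly:
nrow(train)   # 801
nrow(test)    # 269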
Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
tree_oj<-rpart(Purchase~.,data=train,method='class')
#names(tree_oj)
tree_oj
## n= 801
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 801 312 CH (0.61048689 0.38951311)
## 2) LoyalCH>=0.48285 498 80 CH (0.83935743 0.16064257)
## 4) LoyalCH>=0.7645725 258 12 CH (0.95348837 0.04651163) *
## 5) LoyalCH< 0.7645725 240 68 CH (0.71666667 0.28333333)
## 10) PriceDiff>=0.015 160 26 CH (0.83750000 0.16250000) *
## 11) PriceDiff< 0.015 80 38 MM (0.47500000 0.52500000)
## 22) ListPriceDiff>=0.235 27 6 CH (0.77777778 0.22222222) *
## 23) ListPriceDiff< 0.235 53 17 MM (0.32075472 0.67924528)
## 46) STORE>=3.5 9 2 CH (0.77777778 0.22222222) *
## 47) STORE< 3.5 44 10 MM (0.22727273 0.77272727) *
## 3) LoyalCH< 0.48285 303 71 MM (0.23432343 0.76567657)
## 6) LoyalCH>=0.280875 134 50 MM (0.37313433 0.62686567)
## 12) SalePriceMM>=2.04 61 28 CH (0.54098361 0.45901639)
## 24) SalePriceCH< 1.94 50 20 CH (0.60000000 0.40000000) *
## 25) SalePriceCH>=1.94 11 3 MM (0.27272727 0.72727273) *
## 13) SalePriceMM< 2.04 73 17 MM (0.23287671 0.76712329) *
## 7) LoyalCH< 0.280875 169 21 MM (0.12426036 0.87573964) *
Let’s pick 7) LoyalCH< 0.280875 169 21 MM (0.12426036 0.87573964) * :
At this terminal node, observations with CH loyalty scores below 0.280875 are grouped together. One hundred sixty-nine observations fall in this node, and the predicted class is MM: about 88 percent of these observations chose MM and about 12 percent chose CH. The tree has nine terminal nodes, and summing the misclassifications at those nodes (12 + 26 + 6 + 2 + 10 + 20 + 3 + 17 + 21 = 117) gives a training error rate of 117/801, or about 14.6 percent.
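The exercise also asks for summary() output; rpart provides a summary() method that prints per-node detail and a variable-importance table, and the training error rate can be checked directly. A short sketch (output omitted; train_pred is a new helper name):
summary(tree_oj)                                  # per-node splits and variable importance
train_pred<-predict(tree_oj,train,type="class")
mean(train_pred!=train$Purchase)                  # training error rate, ~0.146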
fancyRpartPlot(tree_oj)
tree_pred=predict(tree_oj,test,type="class")
confusionMatrix(test$Purchase,tree_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 147 17
## MM 26 79
##
## Accuracy : 0.8401
## 95% CI : (0.7908, 0.8818)
## No Information Rate : 0.6431
## P-Value [Acc > NIR] : 5.859e-13
##
## Kappa : 0.6589
##
## Mcnemar's Test P-Value : 0.2225
##
## Sensitivity : 0.8497
## Specificity : 0.8229
## Pos Pred Value : 0.8963
## Neg Pred Value : 0.7524
## Prevalence : 0.6431
## Detection Rate : 0.5465
## Detection Prevalence : 0.6097
## Balanced Accuracy : 0.8363
##
## 'Positive' Class : CH
##
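One caveat about the call above: caret's confusionMatrix() expects the predictions as its first argument (data) and the truth as its second (reference), so passing test$Purchase first transposes the row and column labels. Accuracy is unaffected, but Sensitivity/Specificity swap roles with Pos/Neg Pred Value. The conventional ordering (which applies to the later calls as well) would be:
confusionMatrix(data=tree_pred,reference=test$Purchase)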
Apply the cv.tree() function to the training set in order to determine the optimal tree size. (Because the tree was grown with rpart rather than tree, we instead cross-validate over rpart's complexity parameter cp, using caret with repeated 10-fold CV.)
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
prune_oj<-train(Purchase~.,data=train,method='rpart',trControl=train_control)
plot(prune_oj)
#names(prune_oj)
prune_oj
## CART
##
## 801 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 722, 721, 721, 721, 721, 721, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01602564 0.8064935 0.5941041
## 0.02029915 0.8023058 0.5817085
## 0.51602564 0.7045821 0.2970684
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01602564.
prune_oj$bestTune
## cp
## 1 0.01602564
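train() evaluated only the three cp values proposed by default; if a finer search is wanted, a grid can be passed through tuneGrid. A sketch, where cp_grid and prune_fine are hypothetical names:
cp_grid<-expand.grid(cp=seq(0.001,0.05,by=0.002))   # hypothetical, finer cp grid
prune_fine<-train(Purchase~.,data=train,method='rpart',
                  trControl=train_control,tuneGrid=cp_grid)
plot(prune_fine)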
Accuracy was used to select the optimal model using the largest value. The final value used for the model was cp = 0.01602564.
See part (f)
full_pred<-predict(tree_oj,train,type="class")
prune_pred<-predict(prune_oj,train)
confusionMatrix(train$Purchase,full_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 438 51
## MM 66 246
##
## Accuracy : 0.8539
## 95% CI : (0.8276, 0.8777)
## No Information Rate : 0.6292
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6902
##
## Mcnemar's Test P-Value : 0.1956
##
## Sensitivity : 0.8690
## Specificity : 0.8283
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.7885
## Prevalence : 0.6292
## Detection Rate : 0.5468
## Detection Prevalence : 0.6105
## Balanced Accuracy : 0.8487
##
## 'Positive' Class : CH
##
confusionMatrix(train$Purchase,prune_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 401 88
## MM 44 268
##
## Accuracy : 0.8352
## 95% CI : (0.8077, 0.8603)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6621
##
## Mcnemar's Test P-Value : 0.0001821
##
## Sensitivity : 0.9011
## Specificity : 0.7528
## Pos Pred Value : 0.8200
## Neg Pred Value : 0.8590
## Prevalence : 0.5556
## Detection Rate : 0.5006
## Detection Prevalence : 0.6105
## Balanced Accuracy : 0.8270
##
## 'Positive' Class : CH
##
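Comparing the two fits on the training data, the full tree is more accurate (0.8539 vs 0.8352), i.e. the pruned tree has the higher training error rate, as expected once complexity is traded away. The error rates fall out directly from the training predictions computed above:
mean(full_pred!=train$Purchase)    # ~0.146 for the full tree
mean(prune_pred!=train$Purchase)   # ~0.165 for the pruned tree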
full_pred<-predict(tree_oj,test,type="class")
prune_pred<-predict(prune_oj,test)
confusionMatrix(test$Purchase,full_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 147 17
## MM 26 79
##
## Accuracy : 0.8401
## 95% CI : (0.7908, 0.8818)
## No Information Rate : 0.6431
## P-Value [Acc > NIR] : 5.859e-13
##
## Kappa : 0.6589
##
## Mcnemar's Test P-Value : 0.2225
##
## Sensitivity : 0.8497
## Specificity : 0.8229
## Pos Pred Value : 0.8963
## Neg Pred Value : 0.7524
## Prevalence : 0.6431
## Detection Rate : 0.5465
## Detection Prevalence : 0.6097
## Balanced Accuracy : 0.8363
##
## 'Positive' Class : CH
##
confusionMatrix(test$Purchase,prune_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 137 27
## MM 18 87
##
## Accuracy : 0.8327
## 95% CI : (0.7826, 0.8753)
## No Information Rate : 0.5762
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6539
##
## Mcnemar's Test P-Value : 0.233
##
## Sensitivity : 0.8839
## Specificity : 0.7632
## Pos Pred Value : 0.8354
## Neg Pred Value : 0.8286
## Prevalence : 0.5762
## Detection Rate : 0.5093
## Detection Prevalence : 0.6097
## Balanced Accuracy : 0.8235
##
## 'Positive' Class : CH
##
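The same comparison on the test set gives an error of 1 - 0.8401 ≈ 0.160 for the full tree versus 1 - 0.8327 ≈ 0.167 for the pruned tree, so pruning did not improve test accuracy here. Computed from the test predictions above:
mean(full_pred!=test$Purchase)     # ~0.160, full tree
mean(prune_pred!=test$Purchase)    # ~0.167, pruned tree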