This is an exercise in logistic regression and tree-based analysis on the OJ data, available in the ISLR library. Some of the comments are taken directly from the ISLR book.
The data contains 1070 purchases where the customer either purchased Citrus Hill or Minute Maid Orange Juice. A number of characteristics of the customer and product are recorded.
# packages used below (in addition to ISLR for the data)
library(ggplot2)       # plots
library(tidyr)         # gather()
library(leaps)         # regsubsets()
library(tree)          # classification trees
library(randomForest)  # bagging / random forests
library(gbm)           # boosting
ds <- ISLR::OJ
pairs(ds[, -c(1,2,3,8,9,10,14,18)],
col= ds$Purchase,
pch= 21,
cex= 1,
panel = panel.smooth)
boxplot(data= ds, PriceDiff ~ Purchase, main= "Price difference by purchase")
# Purchase by store
table(ds$Purchase, ds$StoreID)
##
## 1 2 3 4 7
## CH 85 107 75 112 274
## MM 72 115 121 27 82
# week of purchase
eda_temp <- as.data.frame(table(ds$Purchase, ds$WeekofPurchase))
colnames(eda_temp) <- c("Purchase", "Week", "Freq")
ggplot(eda_temp, aes(x= Week, y= Freq, colour= Purchase, group= Purchase))+
geom_point()+
geom_line()+
theme_bw()+
theme(legend.position = "top", axis.text.x = element_text(angle = 90, vjust = 0.5))+
ggtitle("Number of purchase by week")
rm(eda_temp)
boxplot(data= ds, LoyalCH ~ Purchase, main= "Customer brand loyalty for CH by purchase")
ds$StoreID <- as.character(ds$StoreID)
ds$STORE <- as.character(ds$STORE)
ds$Purchase <- ifelse(ds$Purchase == "CH", 1, 0)
# removing columns that are duplicates or would introduce singularity issues
ds <- ds[, -c(5, 11, 12, 14, 15, 16, 17, 18)]
# split test, validation and train
set.seed(19)
ds <- ds[sample(1:nrow(ds)), ]
train <- ds[1:535,]
val <- ds[536:803,]
test <- ds[804:1070,]
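As a quick optional sanity check (not part of the original write-up), we can confirm that the share of CH purchases is similar across the three splits:
# proportion of CH purchases (coded 1) in each split
sapply(list(train = train, val = val, test = test), function(d) mean(d$Purchase))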
# baseline: logistic regression on all predictors
glm1 <- glm(family = "binomial", Purchase ~., data = train)
pred1 <- predict(glm1, val[,-1], type = "response")
result1 <- ifelse(pred1 > 0.5, 1, 0)
table(result1, val$Purchase)
##
## result1 0 1
## 0 73 18
## 1 33 144
# accuracy = (73 + 144) / 268 = 81.0%
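The same figure can be obtained directly from the predictions instead of reading the confusion matrix by hand:
mean(result1 == val$Purchase)  # overall validation accuracy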
# best subset selection with regsubsets (leaps)
reg.summary <- summary(regsubsets(Purchase ~ ., train, nvmax = 9))
print(reg.summary)
## Subset selection object
## Call: regsubsets.formula(Purchase ~ ., train, nvmax = 9)
## 12 Variables (and intercept)
## Forced in Forced out
## WeekofPurchase FALSE FALSE
## StoreID2 FALSE FALSE
## StoreID3 FALSE FALSE
## StoreID4 FALSE FALSE
## StoreID7 FALSE FALSE
## PriceCH FALSE FALSE
## DiscCH FALSE FALSE
## DiscMM FALSE FALSE
## SpecialCH FALSE FALSE
## SpecialMM FALSE FALSE
## LoyalCH FALSE FALSE
## PriceDiff FALSE FALSE
## 1 subsets of each size up to 9
## Selection Algorithm: exhaustive
## WeekofPurchase StoreID2 StoreID3 StoreID4 StoreID7 PriceCH DiscCH
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " "
## 5 ( 1 ) " " "*" "*" " " " " " " " "
## 6 ( 1 ) " " "*" "*" "*" " " " " " "
## 7 ( 1 ) " " "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" "*" "*" " " " " " "
## DiscMM SpecialCH SpecialMM LoyalCH PriceDiff
## 1 ( 1 ) " " " " " " "*" " "
## 2 ( 1 ) " " " " " " "*" "*"
## 3 ( 1 ) " " " " " " "*" "*"
## 4 ( 1 ) " " " " "*" "*" "*"
## 5 ( 1 ) " " " " "*" "*" "*"
## 6 ( 1 ) " " " " "*" "*" "*"
## 7 ( 1 ) " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*"
# choosing the best number of variables
par(mfrow = c(2,2))
plot(reg.summary$rss, xlab = "number of variable", ylab = "RSS",
type = "l")
plot(reg.summary$adjr2, xlab = "number of variable", ylab = "Adjusted RSq",
type = "l")
points(6, reg.summary$adjr2[6], col= "red", cex= 2, pch= 20)
plot(reg.summary$cp, xlab = "number of variable", ylab = "Cp",
type = "l")
points(3, reg.summary$cp[3], col= "red", cex= 2, pch= 20)
plot(reg.summary$bic, xlab = "number of variable", ylab = "Bic",
type = "l")
points(2, reg.summary$bic[2], col= "red", cex= 2, pch= 20)
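The sizes highlighted with the red points can also be located programmatically from the reg.summary object, rather than by eye:
which.max(reg.summary$adjr2)  # size with the best adjusted R-squared
which.min(reg.summary$cp)     # size with the lowest Cp
which.min(reg.summary$bic)    # size with the lowest BIC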
# we will choose 3 variables and calculate the accuracy. As one of the chosen variables is StoreID2, we need to create a dummy column for it
train$StoreID2 <- ifelse(train$StoreID == "2", 1,0)
val$StoreID2 <- ifelse(val$StoreID == "2", 1,0)
test$StoreID2 <- ifelse(test$StoreID == "2", 1,0)
glm2 <- glm(family = "binomial", Purchase ~ LoyalCH + PriceDiff + StoreID2, data = train)
print(summary(glm2))
##
## Call:
## glm(formula = Purchase ~ LoyalCH + PriceDiff + StoreID2, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8268 -0.5590 0.2556 0.5817 2.6608
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0495 0.3261 -9.352 < 2e-16 ***
## LoyalCH 6.3055 0.5455 11.560 < 2e-16 ***
## PriceDiff 2.9471 0.4808 6.130 8.8e-10 ***
## StoreID2 -0.4513 0.2755 -1.638 0.101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 712.19 on 534 degrees of freedom
## Residual deviance: 425.57 on 531 degrees of freedom
## AIC: 433.57
##
## Number of Fisher Scoring iterations: 5
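As an optional aside, the coefficients can be read on the odds scale by exponentiating them; for example, the LoyalCH estimate of about 6.3 corresponds to an odds ratio of exp(6.3) for a one-unit (i.e. full-range) increase in loyalty.
exp(coef(glm2))  # odds ratios for the intercept and the three predictors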
pred2 <- predict(glm2, type = "response", val[,-1])
result2 <- ifelse(pred2 > 0.5, 1, 0)
table(result2, val$Purchase)
##
## result2 0 1
## 0 73 19
## 1 33 143
# accuracy = (73 + 143) / 268 = 80.6%
# search over classification thresholds; error rate = misclassified / 268 validation rows
error_learning <- rep(0, length= 100)
for (i in 1:100) {
  result <- ifelse(pred2 >= i * 0.01, 1, 0)
  error_learning[i] <- sum((result - val$Purchase)^2)/268
}
par(mfrow= c(1,1))
plot(x= seq(0.01,1,by= 0.01), y= error_learning, xlab = "Threshold",
main = "Error rate by threshold")
which.min(error_learning)
## [1] 66
points(66, error_learning[66], col= "red", cex= 2, pch= 20)
# new threshold 0.66
result2 <- ifelse(pred2 > 0.66,1,0)
table(result2, val$Purchase) # accuracy = (87 + 134) / 268 = 82.5%
##
## result2 0 1
## 0 87 28
## 1 19 134
# learning curve: accuracy on the training and validation sets as the training set grows
error_train <- rep(0, length = 526)
error_val <- rep(0, length = 526)
for (i in 10:535) {
  glm3 <- glm(family = "binomial", data = train[1:i, ],
              Purchase ~ LoyalCH + PriceDiff + StoreID2)
  predict1 <- predict(glm3, train[1:i, -1], type= "response")
  result <- ifelse(predict1 > 0.66, 1, 0)
  error_train[i-9] <- 1 - sum((result - train$Purchase[1:i])^2)/i   # training accuracy
  predict1 <- predict(glm3, val[, -1], type= "response")
  result <- ifelse(predict1 > 0.66, 1, 0)
  error_val[i-9] <- 1 - sum((result - val$Purchase)^2)/268          # validation accuracy
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (repeated separation and convergence warnings for the smallest training subsets omitted)
learning_curve <- data.frame(i = 10:535, train = error_train, val= error_val)
learning_curve <- gather(learning_curve, category, value, -i)
ggplot(learning_curve, aes(x= i, y= value, colour= category, group= category))+
geom_line(size= 1.2)+
theme_bw()+
ggtitle("Learning curve")+
xlab("number of training examples")+
ylab("accuracy")
glm3 <- glm(family = "binomial", data = train,
Purchase ~ LoyalCH + PriceDiff + StoreID2)
predict1 <- predict(glm3, test[, -1], type = "response")
result <- ifelse(predict1 > 0.66, 1,0)
1 - sum((result - test$Purchase)^2)/267 # 81.27
## [1] 0.8127341
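For completeness, the test-set confusion matrix can be tabulated the same way as for the validation set (output not shown here):
table(result, test$Purchase)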
train$Purchase <- ifelse(train$Purchase == 1, "Yes", "No") # required in order to have a classification tree and not a regression tree
train$Purchase <- as.factor(train$Purchase)
val$Purchase <- ifelse(val$Purchase == 1, "Yes", "No")
val$Purchase <- as.factor(val$Purchase)
test$Purchase <- ifelse(test$Purchase == 1, "Yes", "No")
test$Purchase <- as.factor(test$Purchase)
# tree function
tree1 <- tree(Purchase ~., train)
summary(tree1)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7764 = 409.2 / 527
## Misclassification error rate: 0.1701 = 91 / 535
table(tree1$y, train$Purchase)
##
## No Yes
## No 205 0
## Yes 0 330
plot(tree1)
text(tree1, pretty = 0) # pretty = 0 adds the category names to the split labels
print(tree1) # shows each branch of the tree: the split criterion, the number of observations in the branch, the deviance, the overall prediction for the branch, and the fraction of observations in the branch taking the values Yes and No. Branches leading to terminal nodes are marked with an asterisk
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 535 712.200 Yes ( 0.38318 0.61682 )
## 2) LoyalCH < 0.50395 235 287.900 No ( 0.69787 0.30213 )
## 4) LoyalCH < 0.382815 143 132.500 No ( 0.82517 0.17483 )
## 8) LoyalCH < 0.0320865 38 9.249 No ( 0.97368 0.02632 ) *
## 9) LoyalCH > 0.0320865 105 112.900 No ( 0.77143 0.22857 ) *
## 5) LoyalCH > 0.382815 92 127.500 Yes ( 0.50000 0.50000 )
## 10) PriceDiff < 0.235 45 52.190 No ( 0.73333 0.26667 ) *
## 11) PriceDiff > 0.235 47 55.430 Yes ( 0.27660 0.72340 ) *
## 3) LoyalCH > 0.50395 300 239.300 Yes ( 0.13667 0.86333 )
## 6) LoyalCH < 0.740621 117 137.300 Yes ( 0.27350 0.72650 )
## 12) PriceDiff < 0.25 63 86.560 Yes ( 0.44444 0.55556 ) *
## 13) PriceDiff > 0.25 54 28.520 Yes ( 0.07407 0.92593 ) *
## 7) LoyalCH > 0.740621 183 71.770 Yes ( 0.04918 0.95082 )
## 14) PriceDiff < -0.39 10 12.220 Yes ( 0.30000 0.70000 ) *
## 15) PriceDiff > -0.39 173 52.130 Yes ( 0.03468 0.96532 ) *
# prediction on the validation set
pred1 <- predict(tree1, val, type = "class")
table(pred1, val$Purchase)
##
## pred1 No Yes
## No 71 22
## Yes 35 140
# accuracy = (71 + 140) / 268 = 78.7%
(from ISLR): next we consider whether pruning the tree might lead to improved results. The function cv.tree() performs cross-validation in order to determine the optimal level of tree complexity; cost-complexity pruning is used to select a sequence of trees for consideration. We use the argument FUN= prune.misclass to indicate that we want the classification error rate to guide the cross-validation and pruning process, rather than the default for cv.tree(), which is deviance.
cv.tree2 <- cv.tree(tree1, FUN= prune.misclass)
cv.tree2$size
## [1] 8 4 2 1
cv.tree2$dev # 4 looks the best
## [1] 114 114 120 205
cv.tree2$k # corresponds to the cost-complexity parameter alpha (see ISLR page 309), a tuning parameter that plays a role similar to lambda in the lasso
## [1] -Inf 0.0 10.5 93.0
par(mfrow= c(1,2))
plot(cv.tree2$size, cv.tree2$dev, type= "b")
plot(cv.tree2$k, cv.tree2$dev, type= "b")
# prune the tree
prune.tree2 <- prune.misclass(tree1, best = 4)
plot(prune.tree2)
text(prune.tree2, pretty= 0)
# using the pruned tree on the validation set
pred2 <- predict(prune.tree2, val, type = "class")
table(pred2, val$Purchase)
##
## pred2 No Yes
## No 71 22
## Yes 35 140
# accuracy = (71 + 140) / 268 = 78.7%
Although we didn’t gain in accuracy, we gained in interpretability.
set.seed(1)
bag.train1 <- randomForest(Purchase ~., data = train, mtry= 9,
importance= TRUE)
# mtry = 9: (almost) all predictors used at each split, i.e. bagging (the added StoreID2 dummy makes 10 predictors in total)
print(bag.train1)
##
## Call:
## randomForest(formula = Purchase ~ ., data = train, mtry = 9, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 20.56%
## Confusion matrix:
## No Yes class.error
## No 151 54 0.2634146
## Yes 56 274 0.1696970
bag.pred1 <- predict(bag.train1, val)
table(bag.pred1, val$Purchase)
##
## bag.pred1 No Yes
## No 73 23
## Yes 33 139
# accuracy = (73 + 139) / 268 = 79.1%
# we can change the number of trees grown with the randomForest argument ntree, e.g. ntree = 25
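For instance, a minimal sketch of bagging with only 25 trees; the object name bag.small and the accuracy line are illustrative and their results are not shown here:
set.seed(1)
bag.small <- randomForest(Purchase ~ ., data = train, mtry = 9, ntree = 25)
mean(predict(bag.small, val) == val$Purchase)  # validation accuracy with 25 trees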
Growing a random forest proceeds in exactly the same way, except that we use a smaller value for the mtry argument. By default, randomForest() uses p/3 variables when building a random forest of regression trees and sqrt(p) variables when building a random forest of classification trees.
# here mtry = 3, roughly sqrt(p)
set.seed(1)
rand.train2 <- randomForest(Purchase ~., data = train, mtry= 3,
importance= TRUE)
print(rand.train2)
##
## Call:
## randomForest(formula = Purchase ~ ., data = train, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 19.81%
## Confusion matrix:
## No Yes class.error
## No 150 55 0.2682927
## Yes 51 279 0.1545455
importance(rand.train2) # to see which variables are important
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## WeekofPurchase 9.350784 8.6917020 13.684309 27.119242
## StoreID 10.822314 7.3888388 13.359602 14.757283
## PriceCH 9.025605 7.1130254 12.264294 10.162798
## DiscCH 10.322708 4.9531725 10.524433 6.469431
## DiscMM 8.311410 6.2033011 11.616372 8.297335
## SpecialCH 2.089920 6.8334056 7.276237 3.439487
## SpecialMM 7.755475 6.1557189 10.575689 4.927230
## LoyalCH 76.971299 61.3864686 84.899978 120.208980
## PriceDiff 15.331096 16.8853914 24.278282 24.362819
## StoreID2 1.181229 0.5540973 1.384551 2.925352
# Two measures of variable importance are reported. The former is based on the mean decrease in accuracy of predictions on the out-of-bag samples when a given variable is excluded from the model. The latter is a measure of the total decrease in node impurity that results from splits over that variable, averaged over all trees. For regression trees, node impurity is measured by the training RSS; for classification trees, by the deviance
varImpPlot(rand.train2)
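If we prefer the Gini-based ranking as a plain vector rather than a plot, a small convenience based on the importance matrix shown above:
sort(importance(rand.train2)[, "MeanDecreaseGini"], decreasing = TRUE)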
rand.pred2 <- predict(rand.train2, val)
table(rand.pred2, val$Purchase)
##
## rand.pred2 No Yes
## No 74 20
## Yes 32 142
# accuracy = (74 + 142) / 268 = 80.6%
Boosting works in a similar way to bagging, except that the trees are grown sequentially: each tree is grown using information from previously grown trees. Boosting does not involve bootstrap sampling; instead, each tree is fit on a modified version of the original data set.
We run gbm() with the option distribution= "gaussian" for a regression problem; for binary classification we use "bernoulli". The argument n.trees sets the number of trees (here 1000), interaction.depth = 4 limits the depth of each tree, and shrinkage = 0.01 sets the learning rate.
train$StoreID <- as.factor(train$StoreID)
train$Purchase <- ifelse(train$Purchase == "Yes",1,0)
boost.train3 <- gbm(Purchase~., data = train, distribution = "bernoulli",
n.trees = 1000, interaction.depth = 4,
shrinkage = 0.01)
summary(boost.train3)
## var rel.inf
## LoyalCH LoyalCH 58.6637795
## PriceDiff PriceDiff 14.5297947
## WeekofPurchase WeekofPurchase 11.2361395
## StoreID StoreID 7.1418180
## PriceCH PriceCH 2.3124443
## DiscMM DiscMM 1.8817458
## SpecialMM SpecialMM 1.8392366
## DiscCH DiscCH 1.3688068
## SpecialCH SpecialCH 0.8919662
## StoreID2 StoreID2 0.1342686
# we can also produce partial dependence plots for the two most important variables (LoyalCH and PriceDiff). These plots illustrate the marginal effect of the selected variables on the response after integrating out the other variables
par(mfrow= c(1,2))
plot(boost.train3, i= "LoyalCH")
plot(boost.train3, i= "PriceDiff")
# prediction
val$Purchase <- ifelse(val$Purchase == "Yes",1,0)
val$StoreID <- as.factor(val$StoreID)
boost.pred3 <- predict(boost.train3, val, n.trees = 1000, type = "response")
boost.pred3 <- ifelse(boost.pred3 > 0.5, 1 , 0)
table(boost.pred3, val$Purchase)
##
## boost.pred3 0 1
## 0 70 20
## 1 36 142
# accuracy = (70 + 142) / 268 = 79.1%
# test set
test$Purchase <- ifelse(test$Purchase == "Yes",1,0)
test$StoreID <- as.factor(test$StoreID)
boost.pred4 <- predict(boost.train3, test, n.trees = 1000, type = "response")
boost.pred4 <- ifelse(boost.pred4 > 0.5, 1 , 0)
table(boost.pred4, test$Purchase)
##
## boost.pred4 0 1
## 0 83 24
## 1 23 137
# accuracy = (83 + 137) / 267 = 82.4%
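To close, a small recap of the validation-set accuracies computed above (the logistic regression and boosting test-set figures are 81.3% and 82.4% respectively); the data frame simply collects numbers already reported in this document:
data.frame(
  model = c("glm (all predictors)", "glm (3 vars)", "glm (3 vars, threshold 0.66)",
            "tree", "pruned tree", "bagging", "random forest", "boosting"),
  val_accuracy = c(0.810, 0.806, 0.825, 0.787, 0.787, 0.791, 0.806, 0.791)
)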