This is an exercise in logistic regression and tree-based analysis on the OJ data, available in the ISLR library. Some of the comments are taken directly from the ISLR book.
The data contains 1070 purchases where the customer either purchased Citrus Hill or Minute Maid Orange Juice. A number of characteristics of the customer and product are recorded.
# packages used below (in addition to ISLR for the data)
library(ggplot2)       # plots
library(tidyr)         # gather()
library(leaps)         # regsubsets()
library(tree)          # classification trees
library(randomForest)  # bagging / random forests
library(gbm)           # boosting
ds <- ISLR::OJ
pairs(ds[, -c(1,2,3,8,9,10,14,18)],
col= ds$Purchase,
pch= 21,
cex= 1,
panel = panel.smooth)
boxplot(data= ds, PriceDiff ~ Purchase, main= "Price difference by purchase")
# Purchase by store
table(ds$Purchase, ds$StoreID)
##
## 1 2 3 4 7
## CH 85 107 75 112 274
## MM 72 115 121 27 82
# week of purchase
eda_temp <- as.data.frame(table(ds$Purchase, ds$WeekofPurchase))
colnames(eda_temp) <- c("Purchase", "Week", "Freq")
ggplot(eda_temp, aes(x= Week, y= Freq, colour= Purchase, group= Purchase))+
geom_point()+
geom_line()+
theme_bw()+
theme(legend.position = "top", axis.text.x = element_text(angle = 90, vjust = 0.5))+
ggtitle("Number of purchase by week")
rm(eda_temp)
boxplot(data= ds, LoyalCH ~ Purchase, main= "Customer brand loyalty for CH by purchase")
ds$StoreID <- as.character(ds$StoreID)
ds$STORE <- as.character(ds$STORE)
ds$Purchase <- ifelse(ds$Purchase == "CH", 1, 0)
# removing columns that are duplicates or would introduce singularity issues
ds <- ds[, -c(5, 11, 12, 14, 15, 16, 17, 18)]
# split test, validation and train
set.seed(19)
ds <- ds[sample(1:nrow(ds)), ]
train <- ds[1:535,]
val <- ds[536:803,]
test <- ds[804:1070,]
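As a quick optional sanity check (not part of the original write-up), we can confirm that the share of CH purchases is similar across the three splits:
# proportion of CH purchases (coded 1) in each split
sapply(list(train = train, val = val, test = test), function(d) mean(d$Purchase))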
# baseline: logistic regression on all predictors
glm1 <- glm(family = "binomial", Purchase ~., data = train)
pred1 <- predict(glm1, val[,-1], type = "response")
result1 <- ifelse(pred1 > 0.5, 1, 0)
table(result1, val$Purchase)
##
## result1 0 1
## 0 73 18
## 1 33 144
# accuracy = (73 + 144) / 268 = 81.0%
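The same figure can be obtained directly from the predictions instead of reading the confusion matrix by hand:
mean(result1 == val$Purchase)  # overall validation accuracy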
# best subset selection with regsubsets (leaps)
reg.summary <- summary(regsubsets(Purchase ~ ., train, nvmax = 9))
print(reg.summary)
## Subset selection object
## Call: regsubsets.formula(Purchase ~ ., train, nvmax = 9)
## 12 Variables (and intercept)
## Forced in Forced out
## WeekofPurchase FALSE FALSE
## StoreID2 FALSE FALSE
## StoreID3 FALSE FALSE
## StoreID4 FALSE FALSE
## StoreID7 FALSE FALSE
## PriceCH FALSE FALSE
## DiscCH FALSE FALSE
## DiscMM FALSE FALSE
## SpecialCH FALSE FALSE
## SpecialMM FALSE FALSE
## LoyalCH FALSE FALSE
## PriceDiff FALSE FALSE
## 1 subsets of each size up to 9
## Selection Algorithm: exhaustive
## WeekofPurchase StoreID2 StoreID3 StoreID4 StoreID7 PriceCH DiscCH
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " "
## 5 ( 1 ) " " "*" "*" " " " " " " " "
## 6 ( 1 ) " " "*" "*" "*" " " " " " "
## 7 ( 1 ) " " "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" "*" "*" " " " " " "
## DiscMM SpecialCH SpecialMM LoyalCH PriceDiff
## 1 ( 1 ) " " " " " " "*" " "
## 2 ( 1 ) " " " " " " "*" "*"
## 3 ( 1 ) " " " " " " "*" "*"
## 4 ( 1 ) " " " " "*" "*" "*"
## 5 ( 1 ) " " " " "*" "*" "*"
## 6 ( 1 ) " " " " "*" "*" "*"
## 7 ( 1 ) " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*"
# choosing the best number of variables
par(mfrow = c(2,2))
plot(reg.summary$rss, xlab = "number of variable", ylab = "RSS",
type = "l")
plot(reg.summary$adjr2, xlab = "number of variable", ylab = "Adjusted RSq",
type = "l")
points(6, reg.summary$adjr2[6], col= "red", cex= 2, pch= 20)
plot(reg.summary$cp, xlab = "number of variable", ylab = "Cp",
type = "l")
points(3, reg.summary$cp[3], col= "red", cex= 2, pch= 20)
plot(reg.summary$bic, xlab = "number of variable", ylab = "Bic",
type = "l")
points(2, reg.summary$bic[2], col= "red", cex= 2, pch= 20)
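The sizes highlighted with the red points can also be located programmatically from the reg.summary object, rather than by eye:
which.max(reg.summary$adjr2)  # size with the best adjusted R-squared
which.min(reg.summary$cp)     # size with the lowest Cp
which.min(reg.summary$bic)    # size with the lowest BIC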
# we will choose 3 variables and calculate the accuracy. As one of the chosen variables is StoreID2, we need to create a dummy column for it
train$StoreID2 <- ifelse(train$StoreID == "2", 1,0)
val$StoreID2 <- ifelse(val$StoreID == "2", 1,0)
test$StoreID2 <- ifelse(test$StoreID == "2", 1,0)
glm2 <- glm(family = "binomial", Purchase ~ LoyalCH + PriceDiff + StoreID2, data = train)
print(summary(glm2))
##
## Call:
## glm(formula = Purchase ~ LoyalCH + PriceDiff + StoreID2, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8268 -0.5590 0.2556 0.5817 2.6608
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0495 0.3261 -9.352 < 2e-16 ***
## LoyalCH 6.3055 0.5455 11.560 < 2e-16 ***
## PriceDiff 2.9471 0.4808 6.130 8.8e-10 ***
## StoreID2 -0.4513 0.2755 -1.638 0.101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 712.19 on 534 degrees of freedom
## Residual deviance: 425.57 on 531 degrees of freedom
## AIC: 433.57
##
## Number of Fisher Scoring iterations: 5
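As an optional aside, the coefficients can be read on the odds scale by exponentiating them; for example, the LoyalCH estimate of about 6.3 corresponds to an odds ratio of exp(6.3) for a one-unit (i.e. full-range) increase in loyalty.
exp(coef(glm2))  # odds ratios for the intercept and the three predictors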
pred2 <- predict(glm2, type = "response", val[,-1])
result2 <- ifelse(pred2 > 0.5, 1, 0)
table(result2, val$Purchase)
##
## result2 0 1
## 0 73 19
## 1 33 143
# accuracy = (73 + 143) / 268 = 80.6%
# search over classification thresholds; error rate = misclassified / 268 validation rows
error_learning <- rep(0, length= 100)
for (i in 1:100) {
  result <- ifelse(pred2 >= i * 0.01, 1, 0)
  error_learning[i] <- sum((result - val$Purchase)^2)/268
}
par(mfrow= c(1,1))
plot(x= seq(0.01,1,by= 0.01), y= error_learning, xlab = "Threshold",
main = "Error rate by threshold")
which.min(error_learning)
## [1] 66
points(66, error_learning[66], col= "red", cex= 2, pch= 20)
# new threshold 0.66
result2 <- ifelse(pred2 > 0.66,1,0)
table(result2, val$Purchase) # accuracy = (87 + 134) / 268 = 82.5%
##
## result2 0 1
## 0 87 28
## 1 19 134
# learning curve: accuracy on the training and validation sets as the training set grows
error_train <- rep(0, length = 526)
error_val <- rep(0, length = 526)
for (i in 10:535) {
  glm3 <- glm(family = "binomial", data = train[1:i, ],
              Purchase ~ LoyalCH + PriceDiff + StoreID2)
  predict1 <- predict(glm3, train[1:i, -1], type= "response")
  result <- ifelse(predict1 > 0.66, 1, 0)
  error_train[i-9] <- 1 - sum((result - train$Purchase[1:i])^2)/i   # training accuracy
  predict1 <- predict(glm3, val[, -1], type= "response")
  result <- ifelse(predict1 > 0.66, 1, 0)
  error_val[i-9] <- 1 - sum((result - val$Purchase)^2)/268          # validation accuracy
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (repeated separation and convergence warnings for the smallest training subsets omitted)
learning_curve <- data.frame(i = 10:535, train = error_train, val= error_val)
learning_curve <- gather(learning_curve, category, value, -i)
ggplot(learning_curve, aes(x= i, y= value, colour= category, group= category))+
geom_line(size= 1.2)+
theme_bw()+
ggtitle("Learning curve")+
xlab("number of training examples")+
ylab("accuracy")
glm3 <- glm(family = "binomial", data = train,
Purchase ~ LoyalCH + PriceDiff + StoreID2)
predict1 <- predict(glm3, test[, -1], type = "response")
result <- ifelse(predict1 > 0.66, 1,0)
1 - sum((result - test$Purchase)^2)/267 # 81.27
## [1] 0.8127341
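For completeness, the test-set confusion matrix can be tabulated the same way as for the validation set (output not shown here):
table(result, test$Purchase)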
train$Purchase <- ifelse(train$Purchase == 1, "Yes", "No") # required in order to have a classification tree and not a regression tree
train$Purchase <- as.factor(train$Purchase)
val$Purchase <- ifelse(val$Purchase == 1, "Yes", "No")
val$Purchase <- as.factor(val$Purchase)
test$Purchase <- ifelse(test$Purchase == 1, "Yes", "No")
test$Purchase <- as.factor(test$Purchase)
# tree function
tree1 <- tree(Purchase ~., train)
summary(tree1)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7764 = 409.2 / 527
## Misclassification error rate: 0.1701 = 91 / 535
table(tree1$y, train$Purchase)
##
## No Yes
## No 205 0
## Yes 0 330
plot(tree1)
text(tree1, pretty = 0) # pretty = 0 adds the category names to the split labels
print(tree1) # shows each branch of the tree: the split criterion, the number of observations in the branch, the deviance, the overall prediction for the branch, and the fraction of observations in the branch taking the values Yes and No. Branches leading to terminal nodes are marked with an asterisk
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 535 712.200 Yes ( 0.38318 0.61682 )
## 2) LoyalCH < 0.50395 235 287.900 No ( 0.69787 0.30213 )
## 4) LoyalCH < 0.382815 143 132.500 No ( 0.82517 0.17483 )
## 8) LoyalCH < 0.0320865 38 9.249 No ( 0.97368 0.02632 ) *
## 9) LoyalCH > 0.0320865 105 112.900 No ( 0.77143 0.22857 ) *
## 5) LoyalCH > 0.382815 92 127.500 Yes ( 0.50000 0.50000 )
## 10) PriceDiff < 0.235 45 52.190 No ( 0.73333 0.26667 ) *
## 11) PriceDiff > 0.235 47 55.430 Yes ( 0.27660 0.72340 ) *
## 3) LoyalCH > 0.50395 300 239.300 Yes ( 0.13667 0.86333 )
## 6) LoyalCH < 0.740621 117 137.300 Yes ( 0.27350 0.72650 )
## 12) PriceDiff < 0.25 63 86.560 Yes ( 0.44444 0.55556 ) *
## 13) PriceDiff > 0.25 54 28.520 Yes ( 0.07407 0.92593 ) *
## 7) LoyalCH > 0.740621 183 71.770 Yes ( 0.04918 0.95082 )
## 14) PriceDiff < -0.39 10 12.220 Yes ( 0.30000 0.70000 ) *
## 15) PriceDiff > -0.39 173 52.130 Yes ( 0.03468 0.96532 ) *
# prediction on the validation set
pred1 <- predict(tree1, val, type = "class")
table(pred1, val$Purchase)
##
## pred1 No Yes
## No 71 22
## Yes 35 140
# accuracy = (71 + 140) / 268 = 78.7%
(from ISLR): next we consider whether pruning the tree might lead to improved results. The function cv.tree() performs cross-validation in order to determine the optimal level of tree complexity; cost-complexity pruning is used to select a sequence of trees for consideration. We use the argument FUN= prune.misclass to indicate that we want the classification error rate to guide the cross-validation and pruning process, rather than the default for cv.tree(), which is deviance.
cv.tree2 <- cv.tree(tree1, FUN= prune.misclass)
cv.tree2$size
## [1] 8 4 2 1
cv.tree2$dev # 4 looks the best
## [1] 114 114 120 205
cv.tree2$k # corresponds to the cost-complexity parameter alpha (see ISLR page 309), a tuning parameter that plays a role similar to lambda in the lasso
## [1] -Inf 0.0 10.5 93.0
par(mfrow= c(1,2))
plot(cv.tree2$size, cv.tree2$dev, type= "b")
plot(cv.tree2$k, cv.tree2$dev, type= "b")
# prune the tree
prune.tree2 <- prune.misclass(tree1, best = 4)
plot(prune.tree2)
text(prune.tree2, pretty= 0)
# using the pruned tree on the validation set
pred2 <- predict(prune.tree2, val, type = "class")
table(pred2, val$Purchase)
##
## pred2 No Yes
## No 71 22
## Yes 35 140
# accuracy = (71 + 140) / 268 = 78.7%
Although we didn’t gain in accuracy, we gained in interpretability.
set.seed(1)
bag.train1 <- randomForest(Purchase ~., data = train, mtry= 9,
importance= TRUE)
# mtry = 9: (almost) all predictors used at each split, i.e. bagging (the added StoreID2 dummy makes 10 predictors in total)
print(bag.train1)
##
## Call:
## randomForest(formula = Purchase ~ ., data = train, mtry = 9, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 20.56%
## Confusion matrix:
## No Yes class.error
## No 151 54 0.2634146
## Yes 56 274 0.1696970
bag.pred1 <- predict(bag.train1, val)
table(bag.pred1, val$Purchase)
##
## bag.pred1 No Yes
## No 73 23
## Yes 33 139
# accuracy = (73 + 139) / 268 = 79.1%
# we can change the number of trees grown with the randomForest argument ntree, e.g. ntree = 25
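For instance, a minimal sketch of bagging with only 25 trees; the object name bag.small and the accuracy line are illustrative and their results are not shown here:
set.seed(1)
bag.small <- randomForest(Purchase ~ ., data = train, mtry = 9, ntree = 25)
mean(predict(bag.small, val) == val$Purchase)  # validation accuracy with 25 trees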
Growing a random forest proceeds in exactly the same way, except that we use a smaller value for the mtry argument. By default, randomForest() uses p/3 variables when building a random forest of regression trees and sqrt(p) variables when building a random forest of classification trees.
# here mtry = 3, roughly sqrt(p)
set.seed(1)
rand.train2 <- randomForest(Purchase ~., data = train, mtry= 3,
importance= TRUE)
print(rand.train2)
##
## Call:
## randomForest(formula = Purchase ~ ., data = train, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 19.81%
## Confusion matrix:
## No Yes class.error
## No 150 55 0.2682927
## Yes 51 279 0.1545455
importance(rand.train2) # to see which variables are important
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## WeekofPurchase 9.350784 8.6917020 13.684309 27.119242
## StoreID 10.822314 7.3888388 13.359602 14.757283
## PriceCH 9.025605 7.1130254 12.264294 10.162798
## DiscCH 10.322708 4.9531725 10.524433 6.469431
## DiscMM 8.311410 6.2033011 11.616372 8.297335
## SpecialCH 2.089920 6.8334056 7.276237 3.439487
## SpecialMM 7.755475 6.1557189 10.575689 4.927230
## LoyalCH 76.971299 61.3864686 84.899978 120.208980
## PriceDiff 15.331096 16.8853914 24.278282 24.362819
## StoreID2 1.181229 0.5540973 1.384551 2.925352
# Two measures of variable importance are reported. The former is based on the mean decrease in accuracy of predictions on the out-of-bag samples when a given variable is excluded from the model. The latter is a measure of the total decrease in node impurity that results from splits over that variable, averaged over all trees. For regression trees, node impurity is measured by the training RSS; for classification trees, by the deviance
varImpPlot(rand.train2)
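If we prefer the Gini-based ranking as a plain vector rather than a plot, a small convenience based on the importance matrix shown above:
sort(importance(rand.train2)[, "MeanDecreaseGini"], decreasing = TRUE)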
rand.pred2 <- predict(rand.train2, val)
table(rand.pred2, val$Purchase)
##
## rand.pred2 No Yes
## No 74 20
## Yes 32 142
# accuracy = (74 + 142) / 268 = 80.6%
Boosting works in a similar way to bagging, except that the trees are grown sequentially: each tree is grown using information from previously grown trees. Boosting does not involve bootstrap sampling; instead, each tree is fit on a modified version of the original data set.
We run gbm() with the option distribution= "gaussian" for a regression problem; for binary classification we use "bernoulli". The argument n.trees sets the number of trees (here 1000), interaction.depth = 4 limits the depth of each tree, and shrinkage = 0.01 sets the learning rate.
train$StoreID <- as.factor(train$StoreID)
train$Purchase <- ifelse(train$Purchase == "Yes",1,0)
boost.train3 <- gbm(Purchase~., data = train, distribution = "bernoulli",
n.trees = 1000, interaction.depth = 4,
shrinkage = 0.01)
summary(boost.train3)
## var rel.inf
## LoyalCH LoyalCH 58.6637795
## PriceDiff PriceDiff 14.5297947
## WeekofPurchase WeekofPurchase 11.2361395
## StoreID StoreID 7.1418180
## PriceCH PriceCH 2.3124443
## DiscMM DiscMM 1.8817458
## SpecialMM SpecialMM 1.8392366
## DiscCH DiscCH 1.3688068
## SpecialCH SpecialCH 0.8919662
## StoreID2 StoreID2 0.1342686
# we can also produce partial dependence plots for the two most important variables (LoyalCH and PriceDiff). These plots illustrate the marginal effect of the selected variables on the response after integrating out the other variables
par(mfrow= c(1,2))
plot(boost.train3, i= "LoyalCH")
plot(boost.train3, i= "PriceDiff")
# prediction
val$Purchase <- ifelse(val$Purchase == "Yes",1,0)
val$StoreID <- as.factor(val$StoreID)
boost.pred3 <- predict(boost.train3, val, n.trees = 1000, type = "response")
boost.pred3 <- ifelse(boost.pred3 > 0.5, 1 , 0)
table(boost.pred3, val$Purchase)
##
## boost.pred3 0 1
## 0 70 20
## 1 36 142
# accuracy = (70 + 142) / 268 = 79.1%
# test set
test$Purchase <- ifelse(test$Purchase == "Yes",1,0)
test$StoreID <- as.factor(test$StoreID)
boost.pred4 <- predict(boost.train3, test, n.trees = 1000, type = "response")
boost.pred4 <- ifelse(boost.pred4 > 0.5, 1 , 0)
table(boost.pred4, test$Purchase)
##
## boost.pred4 0 1
## 0 83 24
## 1 23 137
# accuracy = (83 + 137) / 267 = 82.4%
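To close, a small recap of the validation-set accuracies computed above (the logistic regression and boosting test-set figures are 81.3% and 82.4% respectively); the data frame simply collects numbers already reported in this document:
data.frame(
  model = c("glm (all predictors)", "glm (3 vars)", "glm (3 vars, threshold 0.66)",
            "tree", "pruned tree", "bagging", "random forest", "boosting"),
  val_accuracy = c(0.810, 0.806, 0.825, 0.787, 0.787, 0.791, 0.806, 0.791)
)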