Choice: whether the customer purchased The Art History of Florence (1 = purchase, 0 = nonpurchase)
Gender: 0 = Female, 1 = Male
Amount_purchased: Total money spent on BBBC books
Frequency: Total number of purchases in the chosen period
Last_Purchase: Months since last purchase
First_Purchase: Months since first purchase
P_Child: Number of children’s books purchased
P_Youth: Number of youth books purchased
P_Cook: Number of cookbooks purchased
P_DIY: Number of do-it-yourself books purchased
P_Art: Number of art books purchased
library(readxl)
library(e1071)
library(caret)
library(ROCR)
library(ggplot2)
library(tidyverse)
library(MASS)
library(ggThemeAssist)
library(esquisse)
library(gridExtra)
library(trackdown)
library(corrplot)
source('/Users/thomasfarrell/Downloads/optim_threshold.R')
bbtrain = read_excel("/Users/thomasfarrell/Downloads/BBBC-Train.xlsx")
bbtest = read_excel("/Users/thomasfarrell/Downloads/BBBC-Test.xlsx")
colSums(is.na(bbtest))
## Observation Choice Gender Amount_purchased
## 0 0 0 0
## Frequency Last_purchase First_purchase P_Child
## 0 0 0 0
## P_Youth P_Cook P_DIY P_Art
## 0 0 0 0
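The test set is complete; the same check can be run on the training set.
colSums(is.na(bbtrain))   # same missingness check on the training data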
We remove the first variable (Observation) from both data sets because it is just a row identifier.
bbtrain = bbtrain[-c(1)]
bbtest = bbtest[-c(1)]
bbtrain$Choice = as.factor(bbtrain$Choice)
bbtest$Choice = as.factor(bbtest$Choice)
b1_num = dplyr::select_if(bbtrain, is.numeric)
M = cor(b1_num)
corrplot(M, method = "number")
cor(bbtrain[sapply(bbtrain, is.numeric)])
## Gender Amount_purchased Frequency Last_purchase
## Gender 1.000000000 -0.03060700 0.0321704951 -0.02896341
## Amount_purchased -0.030607000 1.00000000 0.0136664846 0.44070127
## Frequency 0.032170495 0.01366648 1.0000000000 -0.04194328
## Last_purchase -0.028963412 0.44070127 -0.0419432803 1.00000000
## First_purchase 0.001026138 0.37481393 0.4459457457 0.81467469
## P_Child -0.041475936 0.29931372 -0.0433279437 0.67913392
## P_Youth -0.014130306 0.18755727 -0.0095854745 0.45325891
## P_Cook -0.026673876 0.30425340 0.0004968833 0.67250539
## P_DIY -0.025946174 0.22331539 -0.0089634125 0.55816739
## P_Art -0.003500037 0.27248948 -0.0613754066 0.53433415
## First_purchase P_Child P_Youth P_Cook
## Gender 0.001026138 -0.04147594 -0.014130306 -0.0266738763
## Amount_purchased 0.374813928 0.29931372 0.187557270 0.3042533969
## Frequency 0.445945746 -0.04332794 -0.009585474 0.0004968833
## Last_purchase 0.814674687 0.67913392 0.453258910 0.6725053933
## First_purchase 1.000000000 0.54482083 0.367892128 0.5710547918
## P_Child 0.544820825 1.00000000 0.174826719 0.2947065185
## P_Youth 0.367892128 0.17482672 1.000000000 0.1816566401
## P_Cook 0.571054792 0.29470652 0.181656640 1.0000000000
## P_DIY 0.462018843 0.25383708 0.188683456 0.2717251256
## P_Art 0.442082061 0.22451285 0.141751220 0.1916807611
## P_DIY P_Art
## Gender -0.025946174 -0.003500037
## Amount_purchased 0.223315392 0.272489483
## Frequency -0.008963412 -0.061375407
## Last_purchase 0.558167395 0.534334145
## First_purchase 0.462018843 0.442082061
## P_Child 0.253837077 0.224512850
## P_Youth 0.188683456 0.141751220
## P_Cook 0.271725126 0.191680761
## P_DIY 1.000000000 0.207791065
## P_Art 0.207791065 1.000000000
The highest correlation is between First_purchase and Last_purchase, which makes sense: for a customer who has made only one purchase, the months since first purchase and the months since last purchase are the same number, and for repeat customers First_purchase is always at least as large as Last_purchase, so the two measures move together.
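A quick sanity check of this relationship (a sketch, not part of the original output): First_purchase should never be smaller than Last_purchase, and the two should coincide for single-purchase customers.
# Months since first purchase should always be >= months since last purchase
mean(bbtrain$First_purchase >= bbtrain$Last_purchase)
# For customers with a single purchase the two measures should be equal
with(subset(bbtrain, Frequency == 1), mean(First_purchase == Last_purchase))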
p1 = ggplot(data = bbtrain, mapping = aes(x = Frequency, fill = Choice)) +
  geom_histogram() +
  theme(plot.title = element_text(face = "italic"),
        panel.background = element_rect(fill = "gray90", colour = "antiquewhite1", linetype = "dotted"),
        plot.background = element_rect(fill = "white", linetype = "dashed"))
p2 = ggplot(bbtrain) +
aes(x = Last_purchase, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
p3 = ggplot(bbtrain) +
aes(x = P_Child, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
p4 = ggplot(bbtrain) +
aes(x = Amount_purchased, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
grid.arrange(p1,p2,p3,p4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
par(mfrow = c(3,3))
hist(bbtrain$Amount_purchased, xlab = "Amount Purchased", main = "Amount Purchased")
hist(bbtrain$Frequency, xlab = "Total Num. Purchased", main = "Total Purchased")
hist(bbtrain$Last_purchase, xlab = "Months Since Last Purchase", main = "Last Purchase")
hist(bbtrain$First_purchase, xlab = "Months Since First Purchase", main = "First Purchase")
hist(bbtrain$P_Child, xlab = "No. Children's Books Purchased", main = "Children's Books")
hist(bbtrain$P_Youth, xlab = "No. Youth Books Purchased", main = "Youth Books")
hist(bbtrain$P_Cook, xlab = "No. Cook Books Purchased", main = "Cook Books")
hist(bbtrain$P_DIY, xlab = "No. DIY Books Purchased", main = "DIY Books")
hist(bbtrain$P_Art, xlab = "No. Art Books Purchased", main = "Art Books")
From these charts we can see that Amount_purchased is the only variable that looks approximately normally distributed; the other variables are right-skewed.
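To quantify the visual impression, sample skewness can be computed with e1071::skewness (the package is already loaded); positive values indicate right skew.
# Sample skewness of each numeric column (Gender is binary, so its value is not meaningful)
sapply(b1_num, e1071::skewness)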
par(mfrow=c(2,2))
# Linear model on the numeric-coded response (as.numeric(Choice) gives 1/2),
# fit here for diagnostics: residual plots now and VIF below.
mod_1 = lm(as.numeric(Choice) ~ ., data = bbtrain)
plot(mod_1)
summary(mod_1)
##
## Call:
## lm(formula = as.numeric(Choice) ~ ., data = bbtrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3642284 0.0307411 44.378 < 2e-16 ***
## Gender -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
# VIF for the linear model
car::vif(mod_1)
## Gender Amount_purchased Frequency Last_purchase
## 1.005801 1.248066 3.253860 18.770402
## First_purchase P_Child P_Youth P_Cook
## 9.685333 3.360349 1.775022 3.324928
## P_DIY P_Art
## 2.016910 2.273771
Last_purchase and First_purchase have VIF values well above 5 (18.77 and 9.69, respectively), indicating severe multicollinearity, so we remove both variables.
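As a reminder of what VIF measures, the value for a numeric predictor can be reproduced by hand: regress that predictor on all of the others and compute 1/(1 - R^2). A minimal sketch for Last_purchase:
# VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j
# on the remaining predictors; this reproduces car::vif's 18.77 above.
aux = lm(Last_purchase ~ . - Choice, data = bbtrain)
1 / (1 - summary(aux)$r.squared)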
# Drop the two collinear recency variables
bbtrain = dplyr::select(bbtrain, -Last_purchase, -First_purchase)
mod_1 = lm(as.numeric(Choice) ~., data = bbtrain)
car::vif(mod_1)
## Gender Amount_purchased Frequency P_Child
## 1.003526 1.232595 1.007587 1.212223
## P_Youth P_Cook P_DIY P_Art
## 1.083475 1.214043 1.163794 1.138879
Our new model has improved: all remaining variables have VIF below 5, so multicollinearity is no longer a concern.
glm.fit = glm(Choice ~ ., data = bbtrain, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Choice ~ ., family = binomial, data = bbtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.31846 -0.69097 -0.47171 -0.02488 2.84182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.286380 0.202966 -1.411 0.15825
## Gender -0.811948 0.134579 -6.033 1.61e-09 ***
## Amount_purchased 0.002406 0.000771 3.120 0.00181 **
## Frequency -0.088625 0.010385 -8.534 < 2e-16 ***
## P_Child -0.194796 0.072207 -2.698 0.00698 **
## P_Youth -0.031928 0.109605 -0.291 0.77082
## P_Cook -0.292392 0.072998 -4.005 6.19e-05 ***
## P_DIY -0.279282 0.108094 -2.584 0.00977 **
## P_Art 1.245842 0.099062 12.576 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1445.0 on 1591 degrees of freedom
## AIC: 1463
##
## Number of Fisher Scoring iterations: 5
optim_threshold(glm.fit,bbtest, bbtest$Choice)
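The optim_threshold function comes from the locally sourced script, so its internals are not shown here. A hypothetical re-implementation, under the assumption that it sweeps candidate cutoffs and maximizes Youden's J (sensitivity + specificity - 1), might look like this:
# Hypothetical sketch of an optimal-threshold search (not the actual optim_threshold.R)
probs = predict(glm.fit, newdata = bbtest, type = "response")
cuts = seq(0.05, 0.95, by = 0.01)
youden = sapply(cuts, function(t) {
  pred = factor(ifelse(probs >= t, 1, 0), levels = c(0, 1))
  cm = table(pred, bbtest$Choice)
  sens = cm["1", "1"] / sum(cm[, "1"])   # true positive rate
  spec = cm["0", "0"] / sum(cm[, "0"])   # true negative rate
  sens + spec - 1
})
cuts[which.max(youden)]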
predprob = predict.glm(glm.fit, newdata = bbtest, type = "response")
# Classify at the 0.23 cutoff (presumably the value suggested by optim_threshold above);
# renamed from predict.glm to avoid shadowing the stats function of the same name
predclass = ifelse(predprob >= .23, 1, 0)
caret::confusionMatrix(as.factor(predclass), as.factor(bbtest$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1490 57
## 1 606 147
##
## Accuracy : 0.7117
## 95% CI : (0.6927, 0.7302)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1948
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72059
## Specificity : 0.71088
## Pos Pred Value : 0.19522
## Neg Pred Value : 0.96315
## Prevalence : 0.08870
## Detection Rate : 0.06391
## Detection Prevalence : 0.32739
## Balanced Accuracy : 0.71573
##
## 'Positive' Class : 1
##
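Since ROCR is already loaded, the logistic model's threshold-free performance on the test set can also be summarized with an ROC curve and its AUC.
# ROC curve and AUC for the logistic model on the test set
rocr_pred = ROCR::prediction(predprob, bbtest$Choice)
plot(ROCR::performance(rocr_pred, "tpr", "fpr"))
ROCR::performance(rocr_pred, "auc")@y.values[[1]]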
set.seed(1)
# Tune over the cost grid; note that gamma has no effect on the linear kernel,
# so cost is the only parameter that actually matters here.
tuned = tune.svm(Choice ~ ., data = bbtrain, kernel = 'linear', gamma = seq(.01, .1, by = .025), cost = seq(.1, 1.2, by = .1), scale = TRUE)
tuned$best.parameters
## gamma cost
## 13 0.01 0.4
# Create the SVM using the tuned parameters
svm1 = svm(Choice ~ ., data = bbtrain, kernel = 'linear', gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
# Make predictions on the training data set
predSVM = predict(svm1, bbtrain)
caret::confusionMatrix(predSVM, bbtrain$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1153 286
## 1 47 114
##
## Accuracy : 0.7919
## 95% CI : (0.7711, 0.8115)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 4.577e-05
##
## Kappa : 0.307
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.28500
## Specificity : 0.96083
## Pos Pred Value : 0.70807
## Neg Pred Value : 0.80125
## Prevalence : 0.25000
## Detection Rate : 0.07125
## Detection Prevalence : 0.10063
## Balanced Accuracy : 0.62292
##
## 'Positive' Class : 1
##
On the training data, the tuned model's accuracy is 79.19%, with sensitivity 0.285 and specificity 0.961. The low sensitivity reflects the class imbalance (purchases are only 25% of the training set), so the model rarely predicts a purchase.
# Make predictions on the testing data set
predSVM = predict(svm1, bbtest)
caret::confusionMatrix(predSVM, bbtest$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2016 147
## 1 80 57
##
## Accuracy : 0.9013
## 95% CI : (0.8884, 0.9132)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9559
##
## Kappa : 0.2832
##
## Mcnemar's Test P-Value : 1.184e-05
##
## Sensitivity : 0.27941
## Specificity : 0.96183
## Pos Pred Value : 0.41606
## Neg Pred Value : 0.93204
## Prevalence : 0.08870
## Detection Rate : 0.02478
## Detection Prevalence : 0.05957
## Balanced Accuracy : 0.62062
##
## 'Positive' Class : 1
##
Against the testing data, the tuned model achieves 90.13% accuracy, with sensitivity 0.279 and specificity 0.962. Note, however, that this accuracy is slightly below the no-information rate of 91.13%, so the headline number largely reflects the class imbalance; the model's practical value lies in correctly flagging about 28% of the actual buyers.