Note

Below we have an LDA model and a Logistic Regression Model with nearly similar prediction accuracy at 89%. The LDA model is useful in that we can see the relationships of the variables in relation to the dependent variable ‘Choice’. I have also used the file ‘BBBC-Train.xlsx’ to Train the models, and ‘BBBC-Test.xlsx’ to Test the models. At no point did I split the data manually.

library(e1071)
library(caret) 
## Loading required package: lattice
## Loading required package: ggplot2
library(gam) 
## Warning: package 'gam' was built under R version 4.1.1
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
library(tidyr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.1
library(MASS)
bbtrain = read_excel("BBBC-Train.xlsx")
bbtest = read_excel("BBBC-Test.xlsx")
str(bbtrain)
## tibble [1,600 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Observation     : num [1:1600] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Choice          : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
bbtrain = subset(bbtrain, select = -Observation)
bbtest = subset(bbtest, select = -Observation)
summary(bbtrain)
##      Choice         Gender       Amount_purchased   Frequency    
##  Min.   :0.00   Min.   :0.0000   Min.   : 15.0    Min.   : 2.00  
##  1st Qu.:0.00   1st Qu.:0.0000   1st Qu.:126.8    1st Qu.: 6.00  
##  Median :0.00   Median :1.0000   Median :203.0    Median :12.00  
##  Mean   :0.25   Mean   :0.6587   Mean   :200.9    Mean   :12.31  
##  3rd Qu.:0.25   3rd Qu.:1.0000   3rd Qu.:273.0    3rd Qu.:16.00  
##  Max.   :1.00   Max.   :1.0000   Max.   :474.0    Max.   :36.00  
##  Last_purchase    First_purchase     P_Child          P_Youth      
##  Min.   : 1.000   Min.   : 2.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:12.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 2.000   Median :18.00   Median :0.0000   Median :0.0000  
##  Mean   : 3.199   Mean   :22.58   Mean   :0.7394   Mean   :0.3375  
##  3rd Qu.: 4.000   3rd Qu.:30.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :12.000   Max.   :96.00   Max.   :8.0000   Max.   :4.0000  
##      P_Cook         P_DIY            P_Art      
##  Min.   :0.00   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.00   Median :0.0000   Median :0.000  
##  Mean   :0.76   Mean   :0.3912   Mean   :0.425  
##  3rd Qu.:1.00   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :6.00   Max.   :4.0000   Max.   :5.000
summary(bbtest)
##      Choice           Gender       Amount_purchased   Frequency   
##  Min.   :0.0000   Min.   :0.0000   Min.   : 15.0    Min.   : 2.0  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:119.0    1st Qu.: 8.0  
##  Median :0.0000   Median :1.0000   Median :198.0    Median :12.0  
##  Mean   :0.0887   Mean   :0.6865   Mean   :195.3    Mean   :13.3  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:268.0    3rd Qu.:16.0  
##  Max.   :1.0000   Max.   :1.0000   Max.   :461.0    Max.   :36.0  
##  Last_purchase    First_purchase     P_Child          P_Youth      
##  Min.   : 1.000   Min.   : 2.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:12.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 2.000   Median :18.00   Median :0.0000   Median :0.0000  
##  Mean   : 3.059   Mean   :22.85   Mean   :0.7287   Mean   :0.3426  
##  3rd Qu.: 4.000   3rd Qu.:32.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :12.000   Max.   :96.00   Max.   :7.0000   Max.   :5.0000  
##      P_Cook           P_DIY            P_Art     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00  
##  Median :0.0000   Median :0.0000   Median :0.00  
##  Mean   :0.7857   Mean   :0.4061   Mean   :0.33  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.00  
##  Max.   :6.0000   Max.   :4.0000   Max.   :4.00
table(bbtrain$Choice)
## 
##    0    1 
## 1200  400
table(bbtest$Choice)
## 
##    0    1 
## 2096  204
anyNA(bbtrain)
## [1] FALSE
anyNA(bbtest)
## [1] FALSE

LDA

Change the variables Gender and Choice to factor

bbtrain$Gender = as.factor(bbtrain$Gender)
bbtrain$Choice = as.factor(bbtrain$Choice)
bbtest$Gender = as.factor(bbtest$Gender)
bbtest$Choice = as.factor(bbtest$Choice)
set.seed(123)
lda.model = lda(Choice ~ ., data = bbtrain)
lda.model
## Call:
## lda(Choice ~ ., data = bbtrain)
## 
## Prior probabilities of groups:
##    0    1 
## 0.75 0.25 
## 
## Group means:
##   Gender1 Amount_purchased Frequency Last_purchase First_purchase   P_Child
## 0  0.6975         194.4167  13.33667      2.951667       22.54667 0.7341667
## 1  0.5425         220.4125   9.24500      3.940000       22.66500 0.7550000
##   P_Youth    P_Cook     P_DIY     P_Art
## 0  0.3275 0.7841667 0.3933333 0.2733333
## 1  0.3675 0.6875000 0.3850000 0.8800000
## 
## Coefficients of linear discriminants:
##                           LD1
## Gender1          -0.707433464
## Amount_purchased  0.001478478
## Frequency        -0.049100848
## Last_purchase     0.524297499
## First_purchase   -0.010820025
## P_Child          -0.682241865
## P_Youth          -0.520664659
## P_Cook           -0.764550263
## P_DIY            -0.730726880
## P_Art             0.636803402

From the LDA model, we can see some of the relationships the variables have when considering the purchase of “The Art History of Florence”, by observing the coefficients of linear discriminates. We see that Gender1 (Male), are less likely than Females to buy this book. We also see that Last_Purchase has a positive value, meaning that when the months since Last Purchase goes up, then the customer is more likely to buy this book. A customer who’s Last Purchase was 10 months ago, is more likely to buy than a customer who’s Last Purchase was 2 months ago. And customers who increasingly but Art books (P_Art) are likely to buy this book also.

creation of predictions

predictions.lda = predict(lda.model, bbtest)

confusion matrix

caret::confusionMatrix(predictions.lda$class, bbtest$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1968  127
##          1  128   77
##                                           
##                Accuracy : 0.8891          
##                  95% CI : (0.8756, 0.9017)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3157          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.9389          
##             Specificity : 0.3775          
##          Pos Pred Value : 0.9394          
##          Neg Pred Value : 0.3756          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8557          
##    Detection Prevalence : 0.9109          
##       Balanced Accuracy : 0.6582          
##                                           
##        'Positive' Class : 0               
## 

88% accuracy

Logistic Model

set.seed(123)
logmodel = glm(Choice ~ ., data = bbtrain, family = binomial)
summary(logmodel)
## 
## Call:
## glm(formula = Choice ~ ., family = binomial, data = bbtrain)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38586  -0.66728  -0.43696  -0.02242   2.72238  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3515281  0.2143839  -1.640   0.1011    
## Gender1          -0.8632319  0.1374499  -6.280 3.38e-10 ***
## Amount_purchased  0.0018641  0.0007918   2.354   0.0186 *  
## Frequency        -0.0755142  0.0165937  -4.551 5.35e-06 ***
## Last_purchase     0.6117713  0.0938127   6.521 6.97e-11 ***
## First_purchase   -0.0147792  0.0128027  -1.154   0.2483    
## P_Child          -0.8112489  0.1167067  -6.951 3.62e-12 ***
## P_Youth          -0.6370422  0.1433778  -4.443 8.87e-06 ***
## P_Cook           -0.9230066  0.1194814  -7.725 1.12e-14 ***
## P_DIY            -0.9058697  0.1437025  -6.304 2.90e-10 ***
## P_Art             0.6861124  0.1270176   5.402 6.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1392.2  on 1589  degrees of freedom
## AIC: 1414.2
## 
## Number of Fisher Scoring iterations: 5

create predictions for logistic regression model

predictions.log = predict(logmodel, newdata = bbtest, type = 'response')

Convert predicted probabilities into classes

predictions.log.surv = ifelse(predictions.log >= 0.5, 1, 0)

run confusion matrix for logistic regression

caret::confusionMatrix(as.factor(predictions.log.surv), bbtest$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1968  125
##          1  128   79
##                                           
##                Accuracy : 0.89            
##                  95% CI : (0.8765, 0.9025)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9998          
##                                           
##                   Kappa : 0.324           
##                                           
##  Mcnemar's Test P-Value : 0.8999          
##                                           
##             Sensitivity : 0.9389          
##             Specificity : 0.3873          
##          Pos Pred Value : 0.9403          
##          Neg Pred Value : 0.3816          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8557          
##    Detection Prevalence : 0.9100          
##       Balanced Accuracy : 0.6631          
##                                           
##        'Positive' Class : 0               
## 

89% accuracy, similar to LDA model. This model is less complex to use and can more easily be explained to stakeholders, therefore the Logistic Regression Model could be preferred over the LDA Model.