Case 2 - Bookbinders

library(readxl)
library(car)

## Loading required package: carData

library(olsrr)

## Warning: package 'olsrr' was built under R version 4.0.3

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.0.3

## ResourceSelection 0.3-5   2019-07-22

library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.5

library(e1071)

BBBC.train <- read_excel("BBBC-Train.xlsx")
BBBC.test <- read_excel("BBBC-Test.xlsx")

Removing “Observation” variable and converting “Choice and Gender” to factors.

BBBC.train$Observation <- NULL
BBBC.test$Observation <- NULL
BBBC.train$Choice <- as.factor(BBBC.train$Choice)
BBBC.train$Gender <- as.factor(BBBC.train$Gender)
BBBC.test$Choice <- as.factor(BBBC.test$Choice)
BBBC.test$Gender <- as.factor(BBBC.test$Gender)

Create 70/30 split for GLM

set.seed(12345)
pseudo.index <- sample(1:nrow(BBBC.train), 0.7*nrow(BBBC.train))
BBBC.pseudo.train <- BBBC.train[pseudo.index, ]
BBBC.pseudo.test <- BBBC.train[-pseudo.index, ]
BBBC.pseudo.test.true <- BBBC.pseudo.test$Choice

Exploratory Analysis:

Relationships between variables:

pairs(BBBC.train)

Correlation table:

cor(BBBC.train[sapply(BBBC.train, is.numeric)])

##                  Amount_purchased     Frequency Last_purchase First_purchase
## Amount_purchased       1.00000000  0.0136664846    0.44070127      0.3748139
## Frequency              0.01366648  1.0000000000   -0.04194328      0.4459457
## Last_purchase          0.44070127 -0.0419432803    1.00000000      0.8146747
## First_purchase         0.37481393  0.4459457457    0.81467469      1.0000000
## P_Child                0.29931372 -0.0433279437    0.67913392      0.5448208
## P_Youth                0.18755727 -0.0095854745    0.45325891      0.3678921
## P_Cook                 0.30425340  0.0004968833    0.67250539      0.5710548
## P_DIY                  0.22331539 -0.0089634125    0.55816739      0.4620188
## P_Art                  0.27248948 -0.0613754066    0.53433415      0.4420821
##                      P_Child      P_Youth       P_Cook        P_DIY       P_Art
## Amount_purchased  0.29931372  0.187557270 0.3042533969  0.223315392  0.27248948
## Frequency        -0.04332794 -0.009585474 0.0004968833 -0.008963412 -0.06137541
## Last_purchase     0.67913392  0.453258910 0.6725053933  0.558167395  0.53433415
## First_purchase    0.54482083  0.367892128 0.5710547918  0.462018843  0.44208206
## P_Child           1.00000000  0.174826719 0.2947065185  0.253837077  0.22451285
## P_Youth           0.17482672  1.000000000 0.1816566401  0.188683456  0.14175122
## P_Cook            0.29470652  0.181656640 1.0000000000  0.271725126  0.19168076
## P_DIY             0.25383708  0.188683456 0.2717251256  1.000000000  0.20779106
## P_Art             0.22451285  0.141751220 0.1916807611  0.207791065  1.00000000

Highest correlations are between “First_purchase” and “Last_purchase”.

Boxplots for the non-categorical variables:

par(mfrow=c(3,3))
hist(BBBC.train$Amount_purchased, xlab = "Amount Purchased")
hist(BBBC.train$Frequency, xlab = "Total Num. Purchases in Study")
hist(BBBC.train$Last_purchase, xlab = "Months Since Last Purchase")
hist(BBBC.train$First_purchase, xlab = "Months Since First Purchase")
hist(BBBC.train$P_Child, xlab = "No. Children's Books Purchased")
hist(BBBC.train$P_Youth, xlab = "No. Youth Books Purchased")
hist(BBBC.train$P_Cook, xlab = "No. Cook Books Purchased")
hist(BBBC.train$P_DIY, xlab = "No. DIY Books Purchased")
hist(BBBC.train$P_Art, xlab = "No. Art Books Purchased")

“Amount Purchased” seems normally distributed, but all others have a heavy positive skew.

Linear Regression

lm1 <- lm(as.numeric(Choice) ~ ., data = BBBC.train)
summary(lm1)

## 
## Call:
## lm(formula = as.numeric(Choice) ~ ., data = BBBC.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9603 -0.2462 -0.1161  0.1622  1.0588 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.3642284  0.0307411  44.378  < 2e-16 ***
## Gender1          -0.1309205  0.0200303  -6.536 8.48e-11 ***
## Amount_purchased  0.0002736  0.0001110   2.464   0.0138 *  
## Frequency        -0.0090868  0.0021791  -4.170 3.21e-05 ***
## Last_purchase     0.0970286  0.0135589   7.156 1.26e-12 ***
## First_purchase   -0.0020024  0.0018160  -1.103   0.2704    
## P_Child          -0.1262584  0.0164011  -7.698 2.41e-14 ***
## P_Youth          -0.0963563  0.0201097  -4.792 1.81e-06 ***
## P_Cook           -0.1414907  0.0166064  -8.520  < 2e-16 ***
## P_DIY            -0.1352313  0.0197873  -6.834 1.17e-11 ***
## P_Art             0.1178494  0.0194427   6.061 1.68e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared:  0.2401, Adjusted R-squared:  0.2353 
## F-statistic:  50.2 on 10 and 1589 DF,  p-value: < 2.2e-16

Check for multicollinearity:

vif(lm1)

##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.005801         1.248066         3.253860        18.770402 
##   First_purchase          P_Child          P_Youth           P_Cook 
##         9.685333         3.360349         1.775022         3.324928 
##            P_DIY            P_Art 
##         2.016910         2.273771

remove “Last Purchase”

lm1.novif <- lm(as.numeric(Choice) ~ . - Last_purchase, BBBC.train)
vif(lm1.novif)

##           Gender Amount_purchased        Frequency   First_purchase 
##         1.005634         1.235982         2.651820         7.182666 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         1.949849         1.307915         2.009609         1.457362 
##            P_Art 
##         1.634878

Check model for significant variables:

summary(lm1.novif)

## 
## Call:
## lm(formula = as.numeric(Choice) ~ . - Last_purchase, data = BBBC.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0018 -0.2482 -0.1277  0.1567  1.1035 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.3926595  0.0309609  44.981  < 2e-16 ***
## Gender1          -0.1290720  0.0203424  -6.345 2.89e-10 ***
## Amount_purchased  0.0003518  0.0001122   3.135 0.001753 ** 
## Frequency        -0.0157943  0.0019980  -7.905 4.97e-15 ***
## First_purchase    0.0046036  0.0015884   2.898 0.003803 ** 
## P_Child          -0.0502183  0.0126891  -3.958 7.90e-05 ***
## P_Youth          -0.0225339  0.0175326  -1.285 0.198888    
## P_Cook           -0.0667467  0.0131127  -5.090 4.00e-07 ***
## P_DIY            -0.0606486  0.0170835  -3.550 0.000396 ***
## P_Art             0.1916012  0.0167447  11.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3847 on 1590 degrees of freedom
## Multiple R-squared:  0.2156, Adjusted R-squared:  0.2111 
## F-statistic: 48.55 on 9 and 1590 DF,  p-value: < 2.2e-16

Check diagnostic plots:

par(mfrow=c(2,2))
plot(lm1.novif, which=c(1:4))

Evidence of non-normality.

Linear Regression with stepwise selection:

lm1.stepwise <- ols_step_both_p(lm1, pent = 0.05, prem = 0.05, details = F)
lm1.stepwise

## 
##                                   Stepwise Selection Summary                                    
## -----------------------------------------------------------------------------------------------
##                              Added/                   Adj.                                         
## Step        Variable        Removed     R-Square    R-Square      C(p)         AIC        RMSE     
## -----------------------------------------------------------------------------------------------
##    1         P_Art          addition       0.128       0.127    227.4360    1649.2034    0.4046    
##    2       Frequency        addition       0.170       0.169    142.0340    1572.6125    0.3949    
##    3         Gender         addition       0.188       0.186    106.5840    1539.7192    0.3908    
##    4         P_Cook         addition       0.200       0.198     82.3710    1516.8351    0.3879    
##    5         P_DIY          addition       0.204       0.201     77.1250    1511.8878    0.3871    
##    6    Amount_purchased    addition       0.208       0.205     70.8160    1505.8838    0.3863    
##    7        P_Child         addition       0.211       0.208     64.8840    1500.2051    0.3855    
##    8     Last_purchase      addition       0.228       0.224     31.7060    1467.7006    0.3815    
##    9        P_Youth         addition       0.239       0.235     10.2160    1446.2388    0.3788    
## -----------------------------------------------------------------------------------------------

lm1.step <- lm(as.numeric(Choice)~ P_Art + Frequency + Gender + P_Cook + P_DIY + Amount_purchased + P_Child + Last_purchase + P_Youth, BBBC.train)
summary(lm1.step)

## 
## Call:
## lm(formula = as.numeric(Choice) ~ P_Art + Frequency + Gender + 
##     P_Cook + P_DIY + Amount_purchased + P_Child + Last_purchase + 
##     P_Youth, data = BBBC.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9802 -0.2452 -0.1157  0.1655  1.0595 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.3727367  0.0297590  46.128  < 2e-16 ***
## P_Art             0.1150034  0.0192719   5.967 2.97e-09 ***
## Frequency        -0.0110830  0.0012128  -9.138  < 2e-16 ***
## Gender1          -0.1316464  0.0200208  -6.575 6.56e-11 ***
## P_Cook           -0.1433497  0.0165218  -8.676  < 2e-16 ***
## P_DIY            -0.1365578  0.0197520  -6.914 6.82e-12 ***
## Amount_purchased  0.0002742  0.0001110   2.470   0.0136 *  
## P_Child          -0.1275991  0.0163571  -7.801 1.11e-14 ***
## Last_purchase     0.0894288  0.0116772   7.658 3.25e-14 ***
## P_Youth          -0.0973642  0.0200903  -4.846 1.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared:  0.2395, Adjusted R-squared:  0.2352 
## F-statistic: 55.63 on 9 and 1590 DF,  p-value: < 2.2e-16

There is enough statistical evidence for us to reject the null hypothesis that all model coefficients are equal to zero.

Diagnostics:

par(mfrow=c(2,2))
plot(lm1.step, which=c(1:4))

Check VIF again:

vif(lm1.step)

##            P_Art        Frequency           Gender           P_Cook 
##         2.233698         1.007855         1.004715         3.290657 
##            P_DIY Amount_purchased          P_Child    Last_purchase 
##         2.009454         1.248033         3.341880        13.920175 
##          P_Youth 
##         1.771355

“Last_purchase” has a very high VIF value, and the QQ plot shows non-normality.

LOGIT:

glm.base <- glm(Choice ~ ., BBBC.pseudo.train, family = "binomial")
vif(glm.base)

##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.027333         1.231797         2.531715        17.150066 
##   First_purchase          P_Child          P_Youth           P_Cook 
##         9.337105         3.080295         1.831745         3.258309 
##            P_DIY            P_Art 
##         2.100789         2.048933

Remove “Last Purchase” and try again.

glm.base<-glm(Choice ~ . - Last_purchase, BBBC.pseudo.train, family="binomial")
vif(glm.base)

##           Gender Amount_purchased        Frequency   First_purchase 
##         1.027489         1.215065         2.219704         7.129562 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         1.916577         1.392179         1.999162         1.492414 
##            P_Art 
##         1.656146

Remove “First purchase” and try again:

glm.base <- glm(Choice ~ . - Last_purchase - First_purchase, BBBC.pseudo.train, family = "binomial")
vif(glm.base)

##           Gender Amount_purchased        Frequency          P_Child 
##         1.028240         1.209582         1.014939         1.226670 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.084659         1.252361         1.194367         1.251723

Run our final model:

glm.null <- glm(Choice ~ 1, BBBC.pseudo.train, family = "binomial")
glm.full <- glm(Choice ~ . - Last_purchase - First_purchase, BBBC.pseudo.train, family = "binomial")
glm.step <- step(glm.null, scope = list(upper = glm.full),
                 direction = "both", test="Chisq", trace = F)
summary(glm.step)

## 
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook + 
##     Amount_purchased + P_DIY + P_Child, family = "binomial", 
##     data = BBBC.pseudo.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2809  -0.6880  -0.4703  -0.1612   2.6014  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.4675401  0.2439857  -1.916 0.055332 .  
## P_Art             1.2303020  0.1174228  10.478  < 2e-16 ***
## Frequency        -0.0866254  0.0122194  -7.089 1.35e-12 ***
## Gender1          -0.7690962  0.1617603  -4.755 1.99e-06 ***
## P_Cook           -0.3072905  0.0883607  -3.478 0.000506 ***
## Amount_purchased  0.0027430  0.0009229   2.972 0.002956 ** 
## P_DIY            -0.2677571  0.1246974  -2.147 0.031773 *  
## P_Child          -0.1728451  0.0859291  -2.011 0.044274 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1248.5  on 1119  degrees of freedom
## Residual deviance: 1006.8  on 1112  degrees of freedom
## AIC: 1022.8
## 
## Number of Fisher Scoring iterations: 5

Goodness of Fit:

hoslem.test(glm.step$y, fitted(glm.step), g=10)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm.step$y, fitted(glm.step)
## X-squared = 4.5158, df = 8, p-value = 0.8078

Here we see that the p-value of the Hosmer Lemeshow test indicates we fail to reject the null hypothesis, so we can say that our stepwise logistical model fits the data well

Check for influential points with Cook’s Distance:

plot(glm.step, which = 4)

glm.step.probs <- predict(glm.step, BBBC.pseudo.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.5] = 1

caret::confusionMatrix(as.factor(glm.step.preds), BBBC.pseudo.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 341  87
##          1  14  38
##                                           
##                Accuracy : 0.7896          
##                  95% CI : (0.7503, 0.8252)
##     No Information Rate : 0.7396          
##     P-Value [Acc > NIR] : 0.00638         
##                                           
##                   Kappa : 0.3263          
##                                           
##  Mcnemar's Test P-Value : 7.82e-13        
##                                           
##             Sensitivity : 0.30400         
##             Specificity : 0.96056         
##          Pos Pred Value : 0.73077         
##          Neg Pred Value : 0.79673         
##              Prevalence : 0.26042         
##          Detection Rate : 0.07917         
##    Detection Prevalence : 0.10833         
##       Balanced Accuracy : 0.63228         
##                                           
##        'Positive' Class : 1               
##

78.96% accuracy.

Compare against the test set:

glm.step.probs <- predict(glm.step, BBBC.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.5] = 1

caret::confusionMatrix(as.factor(glm.step.preds), BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1990  133
##          1  106   71
##                                           
##                Accuracy : 0.8961          
##                  95% CI : (0.8829, 0.9083)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.99461         
##                                           
##                   Kappa : 0.3164          
##                                           
##  Mcnemar's Test P-Value : 0.09261         
##                                           
##             Sensitivity : 0.34804         
##             Specificity : 0.94943         
##          Pos Pred Value : 0.40113         
##          Neg Pred Value : 0.93735         
##              Prevalence : 0.08870         
##          Detection Rate : 0.03087         
##    Detection Prevalence : 0.07696         
##       Balanced Accuracy : 0.64873         
##                                           
##        'Positive' Class : 1               
##

89.61% accuracy

Profit with un-optimized GLM:

VC.mail.posting <- 0.65
VC.book <- 15
VC.book.OH <- VC.book*0.45
P.book <- 31.95
Prof.book <- P.book - VC.mail.posting - VC.book - VC.book.OH

N.Cust <- 2300
N.booksold <- 74
N.mailing <- 106

Profit.unOpt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.unOpt <- Profit.unOpt / N.Cust
Profit.unOpt

## [1] 637.8

$637.8

Prof.per.Cust.unOpt

## [1] 0.2773043

28 cents per customer.

Tuning the selection threshold:

pr = predict(glm.step, BBBC.pseudo.test, type = "response") 
response = BBBC.pseudo.test$Choice

pred <- prediction(pr, response)


auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)


false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
perf <- performance(pred, "tpr","fpr")


plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)


plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, col='red')


min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 4)

optimal prediction threshold is 0.22

Check confusion matrix:

glm.step.probs <- predict(glm.step, BBBC.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.22] = 1

caret::confusionMatrix(as.factor(glm.step.preds), BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1490   57
##          1  606  147
##                                           
##                Accuracy : 0.7117          
##                  95% CI : (0.6927, 0.7302)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1948          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.72059         
##             Specificity : 0.71088         
##          Pos Pred Value : 0.19522         
##          Neg Pred Value : 0.96315         
##              Prevalence : 0.08870         
##          Detection Rate : 0.06391         
##    Detection Prevalence : 0.32739         
##       Balanced Accuracy : 0.71573         
##                                           
##        'Positive' Class : 1               
##

71.17% accuracy

Profit estimation:

N.Cust <- 2300
N.booksold <- 149
N.mailing <- 634

Profit.Opt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.Opt <- Profit.Opt / N.Cust
Profit.Opt

## [1] 1010.85

$1,010.85

Prof.per.Cust.Opt

## [1] 0.4395

43.95 cents per customer.

Percent improvement of optimized to un-optimized GLM:

glm.tuned.pct.imp <- ((Profit.Opt - Profit.unOpt) / Profit.unOpt) * 100

glm.tuned.pct.imp

## [1] 58.49012

“1” Test set:

Test.Choice.1 <- sum(BBBC.test$Choice == 1)
Test.Choice.1

## [1] 204

Test.Choice.0 <- sum(BBBC.test$Choice == 0)
Test.Choice.0

## [1] 2096

Test.Choice.0+Test.Choice.1

## [1] 2300

“1” Training set:

Training.Choice.1 <- sum(BBBC.train$Choice == 1)
Training.Choice.1

## [1] 400

Training.Choice.0 <- sum(BBBC.train$Choice == 0)
Training.Choice.0

## [1] 1200

Training.Choice.0+Training.Choice.1

## [1] 1600

If Bookbinders used no model at all:

N.Cust <- 2300
N.booksold <- 204
N.mailing <- 2096

Profit.NoMod <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.NoMod <- Profit.NoMod / N.Cust
Profit.NoMod

## [1] 585.8

$585.80

Prof.per.Cust.NoMod

## [1] 0.2546957

25.47 cents per customer.

glm.tuned.pct.boost <- ((Profit.Opt - Profit.NoMod) / Profit.NoMod) * 100

glm.tuned.pct.boost

## [1] 72.55889

Our model boosts profit by 72.55%.

Tune the model to minimize its error:

cut.seq <- seq(0.01, 0.99, by = 0.01)

seq.glm <- glm(Choice ~ P_Art + Frequency + Gender + P_Cook + P_Child + P_DIY, family = "binomial", data = BBBC.pseudo.train)

Run the new GLM:

err=c(); 
for(i in 1:length(cut.seq)){
  
  est.prob = predict(seq.glm, newdata=BBBC.pseudo.test[ ,-1], type="response")
  est.class = ifelse(est.prob > cut.seq[i], 1, 0)
  
  
  err[i] = mean(BBBC.pseudo.test.true != est.class, na.rm=TRUE)
}

par(mfrow=c(1,1))
plot(cut.seq, err, type="l")

min(err)

## [1] 0.20625

cut.seq[which.min(err)]

## [1] 0.31

final.cut = cut.seq[which.min(err)]

seq.glm.probs <- predict(seq.glm, BBBC.test, type = "response")

seq.glm.preds <- rep(0, length(seq.glm.probs))
seq.glm.preds[seq.glm.probs > final.cut] = 1

caret::confusionMatrix(as.factor(seq.glm.preds), BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1759   93
##          1  337  111
##                                           
##                Accuracy : 0.813           
##                  95% CI : (0.7965, 0.8288)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2489          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.54412         
##             Specificity : 0.83922         
##          Pos Pred Value : 0.24777         
##          Neg Pred Value : 0.94978         
##              Prevalence : 0.08870         
##          Detection Rate : 0.04826         
##    Detection Prevalence : 0.19478         
##       Balanced Accuracy : 0.69167         
##                                           
##        'Positive' Class : 1               
##

81.3% accuracy

N.Cust <- 2300
N.booksold <- 59
N.mailing <- 73

Profit.Opt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.Opt <- Profit.Opt / N.Cust
Profit.Opt

## [1] 516

$516

Prof.per.Cust.Opt

## [1] 0.2243478

22.43 centers per customer.

SVM:

tuned.svm <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'linear',
                      gamma = seq(0.01, 0.1, by = 0.025),
                      cost = seq(0.1, 1.2, by=0.1), scale=TRUE)
tuned.svm$best.parameters

SVM with tuned parameters:

svm1 <- svm(Choice ~ ., data = BBBC.train, kernel = 'linear',
            gamma = tuned.svm$best.parameters$gamma,
            cost = tuned.svm$best.parameters$cost)

Checking predictions for SVM1 against the training data:

svm.preds <- predict(svm1, BBBC.train)

caret::confusionMatrix(svm.preds, BBBC.train$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1149  270
##          1   51  130
##                                           
##                Accuracy : 0.7994          
##                  95% CI : (0.7789, 0.8187)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 1.765e-06       
##                                           
##                   Kappa : 0.3456          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.32500         
##             Specificity : 0.95750         
##          Pos Pred Value : 0.71823         
##          Neg Pred Value : 0.80973         
##              Prevalence : 0.25000         
##          Detection Rate : 0.08125         
##    Detection Prevalence : 0.11313         
##       Balanced Accuracy : 0.64125         
##                                           
##        'Positive' Class : 1               
##

79.88% accuracy

Running SVM on testing data:

svm.preds <- predict(svm1, BBBC.test)
caret::confusionMatrix(svm.preds, BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2015  147
##          1   81   57
##                                           
##                Accuracy : 0.9009          
##                  95% CI : (0.8879, 0.9128)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9621          
##                                           
##                   Kappa : 0.2819          
##                                           
##  Mcnemar's Test P-Value : 1.672e-05       
##                                           
##             Sensitivity : 0.27941         
##             Specificity : 0.96135         
##          Pos Pred Value : 0.41304         
##          Neg Pred Value : 0.93201         
##              Prevalence : 0.08870         
##          Detection Rate : 0.02478         
##    Detection Prevalence : 0.06000         
##       Balanced Accuracy : 0.62038         
##                                           
##        'Positive' Class : 1               
##

89.91% accuracy

Profit numbers on linear kernel SVM:

N.Cust <- 2300
N.booksold <- 57
N.mailing <- 85

Profit.LSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.LSVM <- Profit.LSVM / N.Cust
Profit.LSVM

## [1] 489.1

$489.10

Prof.per.Cust.LSVM

## [1] 0.2126522

21.27 cents per customer

Tuning on RBF:

tuned.svm.rad <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'radial',
                      gamma = seq(0.01, 0.1, by = 0.01),
                      cost = seq(0.1, 1.2, by=0.25), scale=TRUE)

tuned.svm.rad$best.parameters

svm2 <- svm(Choice ~ ., data = BBBC.train, kernel = 'radial',
            gamma = tuned.svm.rad$best.parameters$gamma,
            cost = tuned.svm.rad$best.parameters$cost)

svm.preds.rad <- predict(svm2, BBBC.test)
caret::confusionMatrix(svm.preds.rad, BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2059  175
##          1   37   29
##                                           
##                Accuracy : 0.9078          
##                  95% CI : (0.8953, 0.9193)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.7355          
##                                           
##                   Kappa : 0.1792          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.14216         
##             Specificity : 0.98235         
##          Pos Pred Value : 0.43939         
##          Neg Pred Value : 0.92167         
##              Prevalence : 0.08870         
##          Detection Rate : 0.01261         
##    Detection Prevalence : 0.02870         
##       Balanced Accuracy : 0.56225         
##                                           
##        'Positive' Class : 1               
##

90.78% accuracy

Profit:

N.Cust <- 2300
N.booksold <- 38
N.mailing <- 42

Profit.RSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.RSVM <- Profit.RSVM / N.Cust
Profit.RSVM

## [1] 335.6

$335.6

Prof.per.Cust.RSVM

## [1] 0.145913

14.59 cents per customer

Evaluating a polynomial kernel SVM:

tuned.svm.poly <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'polynomial',
                      gamma = seq(0.01, 0.1, by = 0.01),
                      cost = seq(0.1, 1.2, by=0.25), scale=TRUE)

tuned.svm.poly$best.parameters

svm3 <- svm(Choice ~ ., data = BBBC.train, kernel = 'polynomial',
            gamma = tuned.svm.poly$best.parameters$gamma,
            cost = tuned.svm.poly$best.parameters$cost)

svm.preds.poly <- predict(svm3, BBBC.test)
caret::confusionMatrix(svm.preds.poly, BBBC.test$Choice, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2052  174
##          1   44   30
##                                           
##                Accuracy : 0.9052          
##                  95% CI : (0.8925, 0.9169)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.856           
##                                           
##                   Kappa : 0.177           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.14706         
##             Specificity : 0.97901         
##          Pos Pred Value : 0.40541         
##          Neg Pred Value : 0.92183         
##              Prevalence : 0.08870         
##          Detection Rate : 0.01304         
##    Detection Prevalence : 0.03217         
##       Balanced Accuracy : 0.56303         
##                                           
##        'Positive' Class : 1               
##

90.52% accuracy.

SVM with poly kernel profit numbers:

N.Cust <- 2300
N.booksold <- 36
N.mailing <- 47

Profit.PSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.PSVM <- Profit.PSVM / N.Cust
Profit.PSVM

## [1] 313.25

$313.25

Prof.per.Cust.PSVM

## [1] 0.1361957

13.61 cents per customer.

Case 2 - Bookbinders

Michael King

10/7/2021