Load Data

# Packages assumed loaded in the setup chunk: readxl, dplyr, ggplot2, car, caret, e1071, ROCR
book_train <- read_excel("BBBC-Train.xlsx")
book_test <- read_excel("BBBC-Test.xlsx")

# Drop the first column (observation index)
book_train <- book_train[, -1]
book_test <- book_test[, -1]
str(book_train)
## tibble [1,600 x 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...


Exploratory Analysis

Scatter plot of all variables

pairs(book_train)


Correlation Analysis

cor(book_train)
##                        Choice       Gender Amount_purchased     Frequency
## Choice            1.000000000 -0.141558415       0.11815256 -0.2260181193
## Gender           -0.141558415  1.000000000      -0.03060700  0.0321704951
## Amount_purchased  0.118152563 -0.030607000       1.00000000  0.0136664846
## Frequency        -0.226018119  0.032170495       0.01366648  1.0000000000
## Last_purchase     0.141437015 -0.028963412       0.44070127 -0.0419432803
## First_purchase    0.003157481  0.001026138       0.37481393  0.4459457457
## P_Child           0.008523377 -0.041475936       0.29931372 -0.0433279437
## P_Youth           0.027608101 -0.014130306       0.18755727 -0.0095854745
## P_Cook           -0.040256351 -0.026673876       0.30425340  0.0004968833
## P_DIY            -0.005309265 -0.025946174       0.22331539 -0.0089634125
## P_Art             0.357688817 -0.003500037       0.27248948 -0.0613754066
##                  Last_purchase First_purchase      P_Child      P_Youth
## Choice              0.14143702    0.003157481  0.008523377  0.027608101
## Gender             -0.02896341    0.001026138 -0.041475936 -0.014130306
## Amount_purchased    0.44070127    0.374813928  0.299313719  0.187557270
## Frequency          -0.04194328    0.445945746 -0.043327944 -0.009585474
## Last_purchase       1.00000000    0.814674687  0.679133923  0.453258910
## First_purchase      0.81467469    1.000000000  0.544820825  0.367892128
## P_Child             0.67913392    0.544820825  1.000000000  0.174826719
## P_Youth             0.45325891    0.367892128  0.174826719  1.000000000
## P_Cook              0.67250539    0.571054792  0.294706519  0.181656640
## P_DIY               0.55816739    0.462018843  0.253837077  0.188683456
## P_Art               0.53433415    0.442082061  0.224512850  0.141751220
##                         P_Cook        P_DIY        P_Art
## Choice           -0.0402563507 -0.005309265  0.357688817
## Gender           -0.0266738763 -0.025946174 -0.003500037
## Amount_purchased  0.3042533969  0.223315392  0.272489483
## Frequency         0.0004968833 -0.008963412 -0.061375407
## Last_purchase     0.6725053933  0.558167395  0.534334145
## First_purchase    0.5710547918  0.462018843  0.442082061
## P_Child           0.2947065185  0.253837077  0.224512850
## P_Youth           0.1816566401  0.188683456  0.141751220
## P_Cook            1.0000000000  0.271725126  0.191680761
## P_DIY             0.2717251256  1.000000000  0.207791065
## P_Art             0.1916807611  0.207791065  1.000000000


Check for and remove missing values

book_train = na.omit(book_train)
book_test = na.omit(book_test)
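As a sanity check, the number of missing values per column can be counted; run before the na.omit() calls above, this shows how many rows (if any) are actually dropped (a small sketch, counts not shown in the original output):

# Count missing values per column (run before the na.omit() calls above)
colSums(is.na(book_train))
colSums(is.na(book_test))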


Plotting the response variable

book_train %>% 
  ggplot(aes(x = factor(ifelse(Choice == 1, "Book Purchased", "No Purchase" )), 
             fill = factor(ifelse(Gender == 0, "Female", "Male")))) +
  geom_bar(stat="count", alpha = 0.8) + 
  stat_count(geom = "text", colour = "black", size = 3.5,
             aes(label = paste("n = ", ..count..)),
             position=position_stack(vjust=0.5)) +
  labs(title = "Choice of Book Purchase by Gender", x = "", y = "", fill = "Gender")
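To put numbers behind the plot, the purchase rate within each gender can be tabulated directly (a quick sketch; the resulting table is not part of the original output):

# Proportion of purchasers within each gender (columns: 0 = Female, 1 = Male)
prop.table(table(book_train$Choice, book_train$Gender), margin = 2)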


a) Linear Regression

book_lm1 = lm(Choice ~ ., data = book_train)
summary(book_lm1)
## 
## Call:
## lm(formula = Choice ~ ., data = book_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9603 -0.2462 -0.1161  0.1622  1.0588 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3642284  0.0307411  11.848  < 2e-16 ***
## Gender           -0.1309205  0.0200303  -6.536 8.48e-11 ***
## Amount_purchased  0.0002736  0.0001110   2.464   0.0138 *  
## Frequency        -0.0090868  0.0021791  -4.170 3.21e-05 ***
## Last_purchase     0.0970286  0.0135589   7.156 1.26e-12 ***
## First_purchase   -0.0020024  0.0018160  -1.103   0.2704    
## P_Child          -0.1262584  0.0164011  -7.698 2.41e-14 ***
## P_Youth          -0.0963563  0.0201097  -4.792 1.81e-06 ***
## P_Cook           -0.1414907  0.0166064  -8.520  < 2e-16 ***
## P_DIY            -0.1352313  0.0197873  -6.834 1.17e-11 ***
## P_Art             0.1178494  0.0194427   6.061 1.68e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared:  0.2401, Adjusted R-squared:  0.2353 
## F-statistic:  50.2 on 10 and 1589 DF,  p-value: < 2.2e-16


Check for multicollinearity (VIF)

vif(book_lm1)
##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.005801         1.248066         3.253860        18.770402 
##   First_purchase          P_Child          P_Youth           P_Cook 
##         9.685333         3.360349         1.775022         3.324928 
##            P_DIY            P_Art 
##         2.016910         2.273771
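Using the common rule of thumb that a VIF above 5 signals problematic collinearity, the offending predictors can be picked out directly from the vector returned by vif() (a sketch):

vif_lm1 <- vif(book_lm1)
vif_lm1[vif_lm1 > 5]   # Last_purchase and First_purchase exceed the threshold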


Run 2nd Linear Regression Model

(after removing Last_purchase, the variable with the highest VIF)

book_lm2 = lm(Choice ~ .-Last_purchase , data = book_train)
summary(book_lm2)
## 
## Call:
## lm(formula = Choice ~ . - Last_purchase, data = book_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0018 -0.2482 -0.1277  0.1567  1.1035 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3926595  0.0309609  12.682  < 2e-16 ***
## Gender           -0.1290720  0.0203424  -6.345 2.89e-10 ***
## Amount_purchased  0.0003518  0.0001122   3.135 0.001753 ** 
## Frequency        -0.0157943  0.0019980  -7.905 4.97e-15 ***
## First_purchase    0.0046036  0.0015884   2.898 0.003803 ** 
## P_Child          -0.0502183  0.0126891  -3.958 7.90e-05 ***
## P_Youth          -0.0225339  0.0175326  -1.285 0.198888    
## P_Cook           -0.0667467  0.0131127  -5.090 4.00e-07 ***
## P_DIY            -0.0606486  0.0170835  -3.550 0.000396 ***
## P_Art             0.1916012  0.0167447  11.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3847 on 1590 degrees of freedom
## Multiple R-squared:  0.2156, Adjusted R-squared:  0.2111 
## F-statistic: 48.55 on 9 and 1590 DF,  p-value: < 2.2e-16


Check for collinearity again

vif(book_lm2)
##           Gender Amount_purchased        Frequency   First_purchase 
##         1.005634         1.235982         2.651820         7.182666 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         1.949849         1.307915         2.009609         1.457362 
##            P_Art 
##         1.634878


Run 3rd Linear Regression Model

(after removing “First_purchase” which still has a VIF > 5.0)

book_lm3 = lm(Choice ~ .-Last_purchase - First_purchase , data = book_train)
summary(book_lm3)
## 
## Call:
## lm(formula = Choice ~ . - Last_purchase - First_purchase, data = book_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9501 -0.2518 -0.1273  0.1509  1.1211 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3731865  0.0302933  12.319  < 2e-16 ***
## Gender           -0.1263728  0.0203683  -6.204 6.99e-10 ***
## Amount_purchased  0.0003688  0.0001123   3.283  0.00105 ** 
## Frequency        -0.0112345  0.0012344  -9.101  < 2e-16 ***
## P_Child          -0.0275983  0.0100284  -2.752  0.00599 ** 
## P_Youth          -0.0014841  0.0159946  -0.093  0.92609    
## P_Cook           -0.0428346  0.0102155  -4.193 2.90e-05 ***
## P_DIY            -0.0384262  0.0153017  -2.511  0.01213 *  
## P_Art             0.2183323  0.0140081  15.586  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3856 on 1591 degrees of freedom
## Multiple R-squared:  0.2114, Adjusted R-squared:  0.2075 
## F-statistic: 53.32 on 8 and 1591 DF,  p-value: < 2.2e-16


Final collinearity check for the linear regression model

vif(book_lm3)
##           Gender Amount_purchased        Frequency          P_Child 
##         1.003526         1.232595         1.007587         1.212223 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.083475         1.214043         1.163794         1.138879


Diagnostic plots for Model 3 - Linear Regression

par(mfrow = c(2,2))
plot(book_lm3)

Comments: The residual and standardized-residual plots show the two-band pattern typical of a binary response rather than the roughly normal spread assumed by linear regression, which motivates fitting a logistic regression model instead.
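A histogram of the raw residuals makes the same point as the diagnostic panels: with a 0/1 response the residuals separate into two clusters rather than forming a bell shape (a sketch; the figure is not reproduced here):

# Reset the 2x2 plotting grid and plot the residual distribution of Model 3
par(mfrow = c(1, 1))
hist(resid(book_lm3), breaks = 40,
     main = "Residuals of book_lm3", xlab = "Residual")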


Predictions for Model 3 - Linear Regression

book_test$PredProb = predict(book_lm3, newdata = book_test, type = "response")
book_test$Choice = as.factor(book_test$Choice)


Convert predicted values to binary choices (cutoff 0.5) for Model 3 - Linear Regression

book_test$PredChoice = ifelse(book_test$PredProb >= 0.5, 1, 0)
table(book_test$PredChoice)
## 
##    0    1 
## 2146  154


Confusion matrix for Model 3 - Linear Regression

# Note: caret::confusionMatrix(data, reference) - the observed Choice is passed
# as the first argument here, so the rows labelled "Prediction" actually hold the
# observed classes and the positive class defaults to 0.
caret::confusionMatrix(book_test$Choice, as.factor(book_test$PredChoice))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2005   91
##          1  141   63
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8861, 0.9111)
##     No Information Rate : 0.933           
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.2984          
##                                           
##  Mcnemar's Test P-Value : 0.001295        
##                                           
##             Sensitivity : 0.9343          
##             Specificity : 0.4091          
##          Pos Pred Value : 0.9566          
##          Neg Pred Value : 0.3088          
##              Prevalence : 0.9330          
##          Detection Rate : 0.8717          
##    Detection Prevalence : 0.9113          
##       Balanced Accuracy : 0.6717          
##                                           
##        'Positive' Class : 0               
## 


b) Logistic Regression

book_logr = glm(Choice ~., data = book_train, family = binomial)
summary(book_logr)
## 
## Call:
## glm(formula = Choice ~ ., family = binomial, data = book_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38586  -0.66728  -0.43696  -0.02242   2.72238  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3515281  0.2143839  -1.640   0.1011    
## Gender           -0.8632319  0.1374499  -6.280 3.38e-10 ***
## Amount_purchased  0.0018641  0.0007918   2.354   0.0186 *  
## Frequency        -0.0755142  0.0165937  -4.551 5.35e-06 ***
## Last_purchase     0.6117713  0.0938127   6.521 6.97e-11 ***
## First_purchase   -0.0147792  0.0128027  -1.154   0.2483    
## P_Child          -0.8112489  0.1167067  -6.951 3.62e-12 ***
## P_Youth          -0.6370422  0.1433778  -4.443 8.87e-06 ***
## P_Cook           -0.9230066  0.1194814  -7.725 1.12e-14 ***
## P_DIY            -0.9058697  0.1437025  -6.304 2.90e-10 ***
## P_Art             0.6861124  0.1270176   5.402 6.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1392.2  on 1589  degrees of freedom
## AIC: 1414.2
## 
## Number of Fisher Scoring iterations: 5


Check collinearity for Model 0 - Logistic Regression (full model)

vif(book_logr)
##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.023359         1.232172         2.490447        17.706670 
##   First_purchase          P_Child          P_Youth           P_Cook 
##         9.247748         2.992269         1.761546         3.229097 
##            P_DIY            P_Art 
##         1.992698         1.938089


Model 1 - after removing Last_purchase (highest VIF)

book_logr1 = glm(Choice ~.- Last_purchase, data = book_train, family = binomial)
summary(book_logr1)
## 
## Call:
## glm(formula = Choice ~ . - Last_purchase, family = binomial, 
##     data = book_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.46171  -0.68074  -0.46620  -0.00855   2.80519  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.1489829  0.2095375  -0.711 0.477079    
## Gender           -0.8302649  0.1350384  -6.148 7.83e-10 ***
## Amount_purchased  0.0022691  0.0007747   2.929 0.003399 ** 
## Frequency        -0.1194992  0.0152620  -7.830 4.89e-15 ***
## First_purchase    0.0306235  0.0108454   2.824 0.004748 ** 
## P_Child          -0.3456948  0.0908420  -3.805 0.000142 ***
## P_Youth          -0.1789417  0.1226235  -1.459 0.144489    
## P_Cook           -0.4578299  0.0950443  -4.817 1.46e-06 ***
## P_DIY            -0.4265209  0.1209960  -3.525 0.000423 ***
## P_Art             1.0778036  0.1144995   9.413  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1437.0  on 1590  degrees of freedom
## AIC: 1457
## 
## Number of Fisher Scoring iterations: 5


Check collinearity for Model 1 - Logistic Regression Model

vif(book_logr1)
##           Gender Amount_purchased        Frequency   First_purchase 
##         1.021977         1.220305         2.173240         6.886806 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         1.904631         1.320305         2.060140         1.462770 
##            P_Art 
##         1.603865


Model 2 - Logistic Regression Model (also dropping First_purchase)

book_logr2 = glm(Choice ~.- Last_purchase - First_purchase, data = book_train, family = binomial)
summary(book_logr2)
## 
## Call:
## glm(formula = Choice ~ . - Last_purchase - First_purchase, family = binomial, 
##     data = book_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.31846  -0.69097  -0.47171  -0.02488   2.84182  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.286380   0.202966  -1.411  0.15825    
## Gender           -0.811948   0.134579  -6.033 1.61e-09 ***
## Amount_purchased  0.002406   0.000771   3.120  0.00181 ** 
## Frequency        -0.088625   0.010385  -8.534  < 2e-16 ***
## P_Child          -0.194796   0.072207  -2.698  0.00698 ** 
## P_Youth          -0.031928   0.109605  -0.291  0.77082    
## P_Cook           -0.292392   0.072998  -4.005 6.19e-05 ***
## P_DIY            -0.279282   0.108094  -2.584  0.00977 ** 
## P_Art             1.245842   0.099062  12.576  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1445.0  on 1591  degrees of freedom
## AIC: 1463
## 
## Number of Fisher Scoring iterations: 5


Final check for collinearity of Model 2

vif(book_logr2)
##           Gender Amount_purchased        Frequency          P_Child 
##         1.020217         1.213528         1.015899         1.215500 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.081019         1.228798         1.179821         1.229491

Comments: All VIFs are now well below 5, so no problematic collinearity remains and all of these predictors can be retained in the final logistic regression model.
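Since the logistic coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to interpret (a sketch using Wald confidence intervals; not part of the original output):

# Odds ratios with approximate 95% Wald confidence intervals for Model 2
exp(cbind(OR = coef(book_logr2), confint.default(book_logr2)))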


Predictions: Model 2 - Logistic Regression

book_test$prob.logr <- predict.glm(book_logr2, newdata = book_test, type="response")


Convert probabilities to binary

book_test$PredChoice_logr = ifelse(book_test$prob.logr >= 0.5, 1, 0)
table(book_test$PredChoice_logr)
## 
##    0    1 
## 2116  184


Confusion Matrix Model 2 - Check the accuracy of the Logistic Regression Model

confusionMatrix(as.factor(book_test$PredChoice_logr), as.factor(book_test$Choice), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1984  132
##          1  112   72
##                                           
##                Accuracy : 0.8939          
##                  95% CI : (0.8806, 0.9062)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9981          
##                                           
##                   Kappa : 0.3134          
##                                           
##  Mcnemar's Test P-Value : 0.2239          
##                                           
##             Sensitivity : 0.3529          
##             Specificity : 0.9466          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.9376          
##              Prevalence : 0.0887          
##          Detection Rate : 0.0313          
##    Detection Prevalence : 0.0800          
##       Balanced Accuracy : 0.6498          
##                                           
##        'Positive' Class : 1               
## 


Sensitivity vs. Specificity Plot

# ROCR prediction object pairing Model 2's predicted probabilities with the observed Choice
PredProb1 = prediction(predict.glm(book_logr2, newdata = book_test, type = "response"),
                       book_test$Choice)
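The same ROCR prediction object can also be used to summarize overall discrimination with the area under the ROC curve (a sketch; the AUC value is not reported in the original output):

# Area under the ROC curve for Model 2 on the test set
auc_logr2 <- performance(PredProb1, measure = "auc")
auc_logr2@y.values[[1]]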


Optimal Cutoff for Model 2 - Logistic Regression Model

# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb1,'sens')@x.values),unlist(performance(PredProb1,'sens')@y.values), type='l', lwd=2, ylab = "y", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot for AIC Model', side=3)

# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb1,'spec')@x.values),unlist(performance(PredProb1,'spec')@y.values), type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2)) 
mtext('Specificity',side=4, col='red')

par(new=TRUE)

min.diff <-which.min(abs(unlist(performance(PredProb1, "sens")@y.values) - unlist(performance(PredProb1, "spec")@y.values)))
min.x<-unlist(performance(PredProb1, "sens")@x.values)[min.diff]
min.y<-unlist(performance(PredProb1, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)


Convert probabilities to binary using the optimal cutoff (0.23)

book_test$PredChoice_logr_ss = ifelse(book_test$prob.logr >= 0.23, 1, 0)
table(book_test$PredChoice_logr_ss)
## 
##    0    1 
## 1547  753


Confusion Matrix: Accuracy of Model 2 - Logistic Regression

confusionMatrix(as.factor(book_test$PredChoice_logr_ss), as.factor(book_test$Choice), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1490   57
##          1  606  147
##                                           
##                Accuracy : 0.7117          
##                  95% CI : (0.6927, 0.7302)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1948          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.72059         
##             Specificity : 0.71088         
##          Pos Pred Value : 0.19522         
##          Neg Pred Value : 0.96315         
##              Prevalence : 0.08870         
##          Detection Rate : 0.06391         
##    Detection Prevalence : 0.32739         
##       Balanced Accuracy : 0.71573         
##                                           
##        'Positive' Class : 1               
## 


c) SVM

# Convert the response and Gender to factors so svm() treats this as classification
book_train$Gender = as.factor(book_train$Gender)
book_train$Choice = as.factor(book_train$Choice)
book_test$Choice = as.factor(book_test$Choice)
book_test$Gender = as.factor(book_test$Gender)
set.seed(10)  # for reproducible cross-validation in tune.svm()

Formula of full SVM Model (all variables)

form1 = Choice ~ .


SVM RBF Kernel

Tuning the RBF Kernel

tuned = tune.svm(form1, data = book_train, gamma = seq(0.001, 0.01, by = 0.005), cost = seq(0.1, 1, by = 0.1))


Finding the best parameters

tuned$best.parameters


Model using the values of the best parameters

book_svm = svm(formula = form1, data = book_train, gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
summary(book_svm)
## 
## Call:
## svm(formula = form1, data = book_train, gamma = tuned$best.parameters$gamma, 
##     cost = tuned$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  787
## 
##  ( 391 396 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1


Predictions on test set

svmpredict = predict(book_svm, newdata = book_test, type = "response")
caret::confusionMatrix(as.factor(svmpredict), book_test$Choice, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2062  173
##          1   34   31
##                                           
##                Accuracy : 0.91            
##                  95% CI : (0.8976, 0.9214)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.6049          
##                                           
##                   Kappa : 0.196           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.15196         
##             Specificity : 0.98378         
##          Pos Pred Value : 0.47692         
##          Neg Pred Value : 0.92260         
##              Prevalence : 0.08870         
##          Detection Rate : 0.01348         
##    Detection Prevalence : 0.02826         
##       Balanced Accuracy : 0.56787         
##                                           
##        'Positive' Class : 1               
## 


SVM Model Linear Kernel

Tuning the Linear kernel

tuned_linear = tune.svm(form1, data = book_train, 
                        gamma = seq(0.001, 0.01, by = 0.005), 
                        cost = seq(0.1, 1, by = 0.1),
                        kernel = "linear")


Finding the best parameters

tuned_linear$best.parameters


Fit the model

book_svm_linear = svm(formula = form1, data = book_train, gamma = tuned_linear$best.parameters$gamma, cost = tuned_linear$best.parameters$cost)
summary(book_svm_linear)
## 
## Call:
## svm(formula = form1, data = book_train, gamma = tuned_linear$best.parameters$gamma, 
##     cost = tuned_linear$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.2 
## 
## Number of Support Vectors:  803
## 
##  ( 400 403 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
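Note that the summary above reports a radial kernel: the svm() call did not pass kernel = "linear", so it fell back to the default. A corrected call would look like the sketch below (book_svm_linear2 is a hypothetical object name; it was not re-run here, so the results that follow still reflect the radial fit):

# Explicitly request a linear kernel (gamma is not used by the linear kernel)
book_svm_linear2 <- svm(formula = form1, data = book_train,
                        kernel = "linear",
                        cost = tuned_linear$best.parameters$cost)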


Predictions on test set

svmpredict_linear = predict(book_svm_linear, newdata = book_test, type = "response")
# As before, the observed Choice is passed as the first (data) argument, so the
# 'Reference' columns hold the model's predictions; this model predicts every
# customer as a non-purchaser, which is why Sensitivity is NA and Prevalence is 0.
confusionMatrix(book_test$Choice, as.factor(svmpredict_linear), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2096    0
##          1  204    0
##                                           
##                Accuracy : 0.9113          
##                  95% CI : (0.8989, 0.9226)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity :     NA          
##             Specificity : 0.9113          
##          Pos Pred Value :     NA          
##          Neg Pred Value :     NA          
##              Prevalence : 0.0000          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0887          
##       Balanced Accuracy :     NA          
##                                           
##        'Positive' Class : 1               
## 


Profit analysis

nr_customers_tot = 50000     # customers in the full mailing database
nr_test_customers = 2300     # customers in the holdout test set
book.purchase.cost = 15      # cost to purchase each book ($)
book.Overhead.perc = 0.45    # overhead, as a fraction of purchase cost
book.price = 31.95           # selling price of the book ($)
book.mailing.price = 0.65    # cost of mailing an offer to one customer ($)

# Profit per book sold, after the purchase cost and 45% overhead on that cost
book.profit = book.price - book.purchase.cost - book.purchase.cost*book.Overhead.perc
book.profit
## [1] 10.2

Profit if the campaign is mailed to the entire customer database

book_order_perc = 0.0903   # response rate of the traditional mass mailing (observed on the earlier 20,000-customer mailing)
(profit_entire_customers = 50000*book_order_perc*book.profit - 50000*(1-book_order_perc)*book.mailing.price) #profit
## [1] 16487.75

Profit using logistic regression

pos_pred_value = 0.19522
(logreg_purchase_perc = (606 + 147)/2300) # percentage of book purchase predicted by Logistic Regression
## [1] 0.3273913
(nr_customers_logreg = round((logreg_purchase_perc*50000),0)) # number of customers targeted using Logistic Regression
## [1] 16370
(profit_log_reg = nr_customers_logreg*pos_pred_value*book.profit - nr_customers_logreg*(1-pos_pred_value)*book.mailing.price) #profit from LR model
## [1] 24033.4

Profit using SVM

pos_pred_value_svm = 0.47692
(svm_purchase_perc = (34 + 31)/2300) # percentage of book purchase predicted by SVM
## [1] 0.02826087
(nr_customers_svm = round((svm_purchase_perc*50000),0)) # number of customers targeted using SVM
## [1] 1413
(profit_svm = nr_customers_svm*pos_pred_value_svm*book.profit - nr_customers_svm*(1-pos_pred_value_svm)*book.mailing.price) #profit from SVM Model
## [1] 6393.234
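The three profit calculations above share the same structure, so they can be wrapped in a small helper for side-by-side comparison. This is just a convenience sketch using the figures already computed above (campaign_profit is a hypothetical helper name, not part of the original analysis):

# Expected campaign profit: profit on targeted customers expected to buy,
# minus mailing cost for targeted customers not expected to buy
campaign_profit <- function(n_targeted, hit_rate,
                            profit_per_book = book.profit,
                            mailing_cost = book.mailing.price) {
  n_targeted * hit_rate * profit_per_book -
    n_targeted * (1 - hit_rate) * mailing_cost
}

campaign_profit(nr_customers_tot, book_order_perc)       # mail everyone: ~16,488
campaign_profit(nr_customers_logreg, pos_pred_value)     # logistic targeting: ~24,033
campaign_profit(nr_customers_svm, pos_pred_value_svm)    # SVM targeting: ~6,393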