library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(readxl)
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks xts::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks xts::last()
## x purrr::lift() masks caret::lift()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(broom)
BBBC_Train <- read_excel("BBBC-Train.xlsx")
BBBC_Test <- read_excel("BBBC-Test.xlsx")
#Removing the Observation ID variable
BBBC_Train$Observation <- NULL
BBBC_Test$Observation <- NULL
#Creating a copy of the dataset
BBBC.T1 <- BBBC_Train
summary(BBBC.T1)
## Choice Gender Amount_purchased Frequency
## Min. :0.00 Min. :0.0000 Min. : 15.0 Min. : 2.00
## 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:126.8 1st Qu.: 6.00
## Median :0.00 Median :1.0000 Median :203.0 Median :12.00
## Mean :0.25 Mean :0.6587 Mean :200.9 Mean :12.31
## 3rd Qu.:0.25 3rd Qu.:1.0000 3rd Qu.:273.0 3rd Qu.:16.00
## Max. :1.00 Max. :1.0000 Max. :474.0 Max. :36.00
## Last_purchase First_purchase P_Child P_Youth
## Min. : 1.000 Min. : 2.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.:12.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :18.00 Median :0.0000 Median :0.0000
## Mean : 3.199 Mean :22.58 Mean :0.7394 Mean :0.3375
## 3rd Qu.: 4.000 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :12.000 Max. :96.00 Max. :8.0000 Max. :4.0000
## P_Cook P_DIY P_Art
## Min. :0.00 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.00 Median :0.0000 Median :0.000
## Mean :0.76 Mean :0.3912 Mean :0.425
## 3rd Qu.:1.00 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :6.00 Max. :4.0000 Max. :5.000
#Converting Choice and Gender to factors
BBBC.T1$Choice <-as.factor(BBBC_Train$Choice)
BBBC.T1$Gender <- as.factor(BBBC_Train$Gender)
BBBC_Test$Choice <- as.factor(BBBC_Test$Choice)
BBBC_Test$Gender <- as.factor(BBBC_Test$Gender)
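A quick sanity check (a sketch, not part of the original analysis): confirm the factor levels line up between the train and test sets before modeling.
stopifnot(identical(levels(BBBC.T1$Choice), levels(BBBC_Test$Choice)),
          identical(levels(BBBC.T1$Gender), levels(BBBC_Test$Gender)))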
#Exploratory Analysis
par(mfrow=c(2,2))
plot(BBBC.T1$Gender,BBBC.T1$Amount_purchased, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Child, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Youth, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Cook, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_DIY, col=c('Red','green'))
plot(BBBC.T1$Gender,BBBC.T1$P_Art, col=c('Red','green'))
hist(BBBC.T1$P_Child, main = "Number of Children's Books Purchased")
hist(BBBC.T1$P_Youth, main = "Number of Youth Books Purchased")
hist(BBBC.T1$P_Cook, main = "Number of Cookbooks Purchased")
hist(BBBC.T1$P_DIY, main = "Number of DIY Books Purchased")
hist(BBBC.T1$P_Art, main = "Number of Art Books Purchased")
Stats and facts (calculated in Excel): total female = 546, total male = 1054.

| Category | Total purchased by female | Total purchased by male |
|---|---|---|
| P_Child | 437 | 746 |
| P_Youth | 191 | 349 |
| P_Cook | 436 | 780 |
| P_DIY | 227 | 399 |
| P_Art | 234 | 446 |
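The Excel tallies can be reproduced in R; a sketch assuming Gender == 0 codes female and 1 codes male (consistent with the totals above), and dplyr >= 1.0 for across():
BBBC.T1 %>%
  group_by(Gender) %>%
  summarise(n = n(), across(P_Child:P_Art, sum))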
#6. Correlation
chart.Correlation(BBBC_Train, histogram=FALSE, pch=19)
First_purchase and Last_purchase are highly correlated (r = 0.81).
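The pairwise value can be confirmed directly (a sketch):
cor(BBBC_Train$First_purchase, BBBC_Train$Last_purchase)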
#Linear Regression and assumptions
par(mfrow=c(2,2)) # set 2 rows and 2 column plot layout
mod_1 <- lm(Choice ~., data=BBBC_Train) # linear model
plot(mod_1)
summary(mod_1)
##
## Call:
## lm(formula = Choice ~ ., data = BBBC_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3642284 0.0307411 11.848 < 2e-16 ***
## Gender -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
#VIF using Linear Model
car::vif(mod_1)
## Gender Amount_purchased Frequency Last_purchase
## 1.005801 1.248066 3.253860 18.770402
## First_purchase P_Child P_Youth P_Cook
## 9.685333 3.360349 1.775022 3.324928
## P_DIY P_Art
## 2.016910 2.273771
#Base logit model with all predictors
glm.fit1 <- glm(Choice~., data =BBBC.T1, family = 'binomial')
#Excluding Last_purchase and First_purchase
glm.fit2 <- glm(Choice~ Gender+Amount_purchased+Frequency+P_Child+P_Youth+P_Cook+P_DIY+P_Art, data = BBBC.T1, family= 'binomial')
#Checking collinearity with the base logit model
car::vif(glm.fit1)
## Gender Amount_purchased Frequency Last_purchase
## 1.023359 1.232172 2.490447 17.706670
## First_purchase P_Child P_Youth P_Cook
## 9.247748 2.992269 1.761546 3.229097
## P_DIY P_Art
## 1.992698 1.938089
Collinearity is high for Last_purchase (VIF = 17.7) and First_purchase (VIF = 9.2), so both are excluded in glm.fit2.
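A sketch that flags the offenders programmatically, using the common VIF > 5 rule of thumb:
v <- car::vif(glm.fit1)
names(v)[v > 5]   # expected: Last_purchase, First_purchase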
#Checking collinearity excluding Last_purchase and First_purchase
car::vif(glm.fit2)
## Gender Amount_purchased Frequency P_Child
## 1.020217 1.213528 1.015899 1.215500
## P_Youth P_Cook P_DIY P_Art
## 1.081019 1.228798 1.179821 1.229491
# Predict the probability (p) from the logistic regression fit run above
probabilities <- predict(glm.fit2, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
## 1 2 3 4 5 6
## "neg" "pos" "neg" "neg" "pos" "neg"
# Select only numeric predictors
mydata2 <- BBBC.T1 %>%
dplyr::select_if(is.numeric)
predictors <- colnames(mydata2)
# Bind the logit and tidying the data for plot
mydata2 <- mydata2 %>%
mutate(logit = log(probabilities/(1-probabilities))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
# Create the scatter plots
ggplot(mydata2, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
plot(glm.fit2, which=4, id.n=3)
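To list the top influential observations from the Cook's distance plot numerically (a sketch):
head(sort(cooks.distance(glm.fit2), decreasing = TRUE), 3)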
#GLM Model with interaction terms
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
glm.fit3 <- train(Choice~ Gender+Amount_purchased+Frequency+Last_purchase+P_Child+P_Youth+P_Cook+P_DIY+P_Art+
Gender*P_Child+Gender*P_Youth+Gender*P_Cook+Gender*P_DIY+Gender*P_Art,
data = BBBC.T1,
method = "glm",
family = binomial,
trControl = train.control)
summary(glm.fit3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.49064 -0.67179 -0.43069 -0.01095 2.79158
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.292559 0.221113 -1.323 0.185796
## Gender1 -0.863087 0.194887 -4.429 9.48e-06 ***
## Amount_purchased 0.001876 0.000794 2.362 0.018164 *
## Frequency -0.090221 0.010634 -8.484 < 2e-16 ***
## Last_purchase 0.549378 0.079275 6.930 4.21e-12 ***
## P_Child -0.806151 0.141345 -5.703 1.17e-08 ***
## P_Youth -0.752065 0.193530 -3.886 0.000102 ***
## P_Cook -0.876094 0.139047 -6.301 2.96e-10 ***
## P_DIY -0.704329 0.179889 -3.915 9.03e-05 ***
## P_Art 0.492890 0.174077 2.831 0.004634 **
## `Gender1:P_Child` -0.016242 0.146983 -0.111 0.912010
## `Gender1:P_Youth` 0.189251 0.228454 0.828 0.407445
## `Gender1:P_Cook` -0.085407 0.146986 -0.581 0.561204
## `Gender1:P_DIY` -0.398897 0.223753 -1.783 0.074626 .
## `Gender1:P_Art` 0.286821 0.206231 1.391 0.164292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1388.2 on 1585 degrees of freedom
## AIC: 1418.2
##
## Number of Fisher Scoring iterations: 5
None of the interaction terms is statistically significant at the 5% level.
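The interaction block can also be tested jointly; a sketch using a likelihood-ratio test on plain glm() refits of the two nested models (caret's cross-validation is set aside here):
glm.base <- glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase +
                  P_Child + P_Youth + P_Cook + P_DIY + P_Art,
                data = BBBC.T1, family = binomial)
glm.inter <- update(glm.base, . ~ . + Gender:(P_Child + P_Youth + P_Cook + P_DIY + P_Art))
anova(glm.base, glm.inter, test = "Chisq")   # 5-df test of all Gender interactions at once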
pred.glm <- predict(glm.fit3, BBBC_Test)
confusionMatrix(pred.glm, BBBC_Test$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1971 127
## 1 125 77
##
## Accuracy : 0.8904
## 95% CI : (0.877, 0.9029)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.3192
##
## Mcnemar's Test P-Value : 0.9498
##
## Sensitivity : 0.9404
## Specificity : 0.3775
## Pos Pred Value : 0.9395
## Neg Pred Value : 0.3812
## Prevalence : 0.9113
## Detection Rate : 0.8570
## Detection Prevalence : 0.9122
## Balanced Accuracy : 0.6589
##
## 'Positive' Class : 0
##
#GLM model excluding First_purchase
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
glm.fit4 <- train(Choice~ Gender+Amount_purchased+Frequency+Last_purchase+P_Child+P_Youth+P_Cook+P_DIY+P_Art, data = BBBC.T1, method = "glm",family = binomial, trControl = train.control)
pred.glm <- predict(glm.fit4, BBBC_Test)
confusionMatrix(pred.glm, BBBC_Test$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1973 127
## 1 123 77
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.3216
##
## Mcnemar's Test P-Value : 0.8495
##
## Sensitivity : 0.9413
## Specificity : 0.3775
## Pos Pred Value : 0.9395
## Neg Pred Value : 0.3850
## Prevalence : 0.9113
## Detection Rate : 0.8578
## Detection Prevalence : 0.9130
## Balanced Accuracy : 0.6594
##
## 'Positive' Class : 0
##
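confusionMatrix() treats the first factor level (0, non-buyers) as the positive class by default; a buyer-focused view is available explicitly (a sketch):
confusionMatrix(pred.glm, BBBC_Test$Choice, positive = "1")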
#SVM Models
set.seed(123)
model1 <- Choice ~ Gender + Amount_purchased + Frequency + Last_purchase +
  P_Child + P_Youth + P_Cook + P_DIY + P_Art   # Gender is already a factor in BBBC.T1
tune <- tune.svm(model1, data = BBBC.T1, gamma =seq(.01, 0.1, by = .01), cost = seq(0.1,1, by = 0.1))
tune$best.parameters
## gamma cost
## 90 0.1 0.9
tune$performances
## gamma cost error dispersion
## 1 0.01 0.1 0.250000 0.03750000
## 2 0.02 0.1 0.250000 0.03750000
## 3 0.03 0.1 0.245000 0.03343734
## 4 0.04 0.1 0.240000 0.03513367
## 5 0.05 0.1 0.236875 0.03649700
## 6 0.06 0.1 0.235000 0.04084609
## 7 0.07 0.1 0.235000 0.04052605
## 8 0.08 0.1 0.233750 0.04105806
## 9 0.09 0.1 0.234375 0.04169270
## 10 0.10 0.1 0.235625 0.04187966
## 11 0.01 0.2 0.248125 0.03461720
## 12 0.02 0.2 0.223125 0.03842042
## 13 0.03 0.2 0.220000 0.03793727
## 14 0.04 0.2 0.216250 0.03622844
## 15 0.05 0.2 0.216875 0.03609041
## 16 0.06 0.2 0.218125 0.03755205
## 17 0.07 0.2 0.220625 0.03819381
## 18 0.08 0.2 0.220000 0.03712778
## 19 0.09 0.2 0.220000 0.03701070
## 20 0.10 0.2 0.220625 0.03964359
## 21 0.01 0.3 0.229375 0.03715700
## 22 0.02 0.3 0.213125 0.03625838
## 23 0.03 0.3 0.215000 0.03574602
## 24 0.04 0.3 0.217500 0.03382451
## 25 0.05 0.3 0.218750 0.03333333
## 26 0.06 0.3 0.218750 0.03535534
## 27 0.07 0.3 0.220000 0.03593976
## 28 0.08 0.3 0.221875 0.03611445
## 29 0.09 0.3 0.221250 0.03670453
## 30 0.10 0.3 0.220625 0.03762133
## 31 0.01 0.4 0.220000 0.03861761
## 32 0.02 0.4 0.215000 0.03763863
## 33 0.03 0.4 0.216875 0.03633013
## 34 0.04 0.4 0.216250 0.03574602
## 35 0.05 0.4 0.217500 0.03533078
## 36 0.06 0.4 0.217500 0.03606033
## 37 0.07 0.4 0.218750 0.03510896
## 38 0.08 0.4 0.218125 0.03637789
## 39 0.09 0.4 0.219375 0.03685204
## 40 0.10 0.4 0.220625 0.03633013
## 41 0.01 0.5 0.213750 0.03939649
## 42 0.02 0.5 0.215625 0.03764440
## 43 0.03 0.5 0.215000 0.03634805
## 44 0.04 0.5 0.215625 0.03623443
## 45 0.05 0.5 0.215625 0.03489195
## 46 0.06 0.5 0.216875 0.03333984
## 47 0.07 0.5 0.216250 0.03322900
## 48 0.08 0.5 0.217500 0.03382451
## 49 0.09 0.5 0.218125 0.03403557
## 50 0.10 0.5 0.220625 0.03680490
## 51 0.01 0.6 0.212500 0.03761556
## 52 0.02 0.6 0.215000 0.03705758
## 53 0.03 0.6 0.215000 0.03821086
## 54 0.04 0.6 0.216250 0.03425801
## 55 0.05 0.6 0.216875 0.03359920
## 56 0.06 0.6 0.216875 0.03385657
## 57 0.07 0.6 0.217500 0.03224795
## 58 0.08 0.6 0.217500 0.03197764
## 59 0.09 0.6 0.218125 0.03313090
## 60 0.10 0.6 0.218125 0.03685204
## 61 0.01 0.7 0.213125 0.03755205
## 62 0.02 0.7 0.214375 0.03864569
## 63 0.03 0.7 0.215625 0.03922535
## 64 0.04 0.7 0.216875 0.03536148
## 65 0.05 0.7 0.217500 0.03408018
## 66 0.06 0.7 0.217500 0.03408018
## 67 0.07 0.7 0.216875 0.03333984
## 68 0.08 0.7 0.217500 0.03073181
## 69 0.09 0.7 0.216250 0.03413108
## 70 0.10 0.7 0.216250 0.03574602
## 71 0.01 0.8 0.214375 0.03819381
## 72 0.02 0.8 0.213750 0.03872983
## 73 0.03 0.8 0.213750 0.03917553
## 74 0.04 0.8 0.216875 0.03536148
## 75 0.05 0.8 0.216875 0.03461720
## 76 0.06 0.8 0.216875 0.03411200
## 77 0.07 0.8 0.217500 0.03197764
## 78 0.08 0.8 0.217500 0.03278190
## 79 0.09 0.8 0.215000 0.03335936
## 80 0.10 0.8 0.210000 0.03361857
## 81 0.01 0.9 0.213125 0.03957785
## 82 0.02 0.9 0.214375 0.03975293
## 83 0.03 0.9 0.215000 0.03786856
## 84 0.04 0.9 0.216875 0.03536148
## 85 0.05 0.9 0.216250 0.03451047
## 86 0.06 0.9 0.216875 0.03333984
## 87 0.07 0.9 0.216875 0.03385657
## 88 0.08 0.9 0.215625 0.03362503
## 89 0.09 0.9 0.211250 0.03304563
## 90 0.10 0.9 0.208750 0.03216710
## 91 0.01 1.0 0.213750 0.03961621
## 92 0.02 1.0 0.213750 0.03917553
## 93 0.03 1.0 0.215625 0.03647321
## 94 0.04 1.0 0.216875 0.03536148
## 95 0.05 1.0 0.215625 0.03451675
## 96 0.06 1.0 0.218125 0.03273552
## 97 0.07 1.0 0.215625 0.03451675
## 98 0.08 1.0 0.213750 0.03557562
## 99 0.09 1.0 0.209375 0.03513985
## 100 0.10 1.0 0.211250 0.03030516
The lowest cross-validated error (0.20875) occurs at gamma = 0.10 and cost = 0.9, matching tune$best.parameters above.
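The optimum can also be read off programmatically. Note that tune.svm() tunes with svm()'s default radial kernel, so the selected gamma and cost are reused with a different (polynomial) kernel in the fit below.
tune$best.performance   # lowest cross-validated error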
mysvm <- svm(formula = model1, data = BBBC.T1, kernel = "polynomial",
             gamma = tune$best.parameters$gamma, cost = tune$best.parameters$cost)
summary(mysvm)
##
## Call:
## svm(formula = model1, data = BBBC.T1, kernel = "polynomial", gamma = tune$best.parameters$gamma,
## cost = tune$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.9
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 752
##
## ( 356 396 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict <- predict(mysvm,BBBC_Test, type = "response")
# Cross tabs or confusion matrix
table(pred=svmpredict,true=BBBC_Test$Choice)
## true
## pred 0 1
## 0 2056 167
## 1 40 37
(2056+37)/(2300)
## [1] 0.91
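The same accuracy computed directly from the predictions (a sketch):
mean(svmpredict == BBBC_Test$Choice)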
BBBC_Test$svmpredict <- predict(mysvm,BBBC_Test, type = "response")
write.csv(BBBC_Test,file="finalModel.csv")
#Model Comparison
| Model | Accuracy |
|---|---|
| glm.fit3 (with interactions) | 0.8904 |
| glm.fit4 | 0.8913 |
| mysvm (polynomial SVM) | 0.9100 |

All three accuracies sit near the test set's no-information rate of 0.9113, so specificity on the buyer class (Choice = 1) is worth weighing alongside raw accuracy when choosing a model.