Bookbinders Case Study

Splitting Data

bbbc_test <- read_excel("BBBC-Test.xlsx")

bbbc_test <- (subset(bbbc_test, select=-c(Observation)))

bbbc_test$Gender[bbbc_test$Gender==1]="Male";

bbbc_test$Gender[bbbc_test$Gender==0]="Female";




bbbc_train <- read_excel("BBBC-Train.xlsx")

bbbc_train <- (subset(bbbc_train, select=-c(Observation)))

bbbc_train$Gender[bbbc_train$Gender==1]="Male";

bbbc_train$Gender[bbbc_train$Gender==0]="Female";




anyNA(bbbc_train)

## [1] FALSE

anyNA(bbbc_test)

## [1] FALSE

str(bbbc_train)

## tibble [1,600 x 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : chr [1:1600] "Male" "Male" "Male" "Male" ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...

summary(bbbc_train$Choice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00    0.25    0.25    1.00

table(bbbc_train$Choice)

## 
##    0    1 
## 1200  400

Exploratory Analysis Note high correlation

#pairs(bbbc_train)
pairs(subset(bbbc_train, select=-c(Gender)))

plot(bbbc_train)

#cor(subset(bbbc_train, select=-c(Gender)))

Multicollinearity (taking out Last Purchased due to high VIF/correlation)

bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

vif(bbbc_logit)

##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.023359         1.232172         2.490447        17.706670 
##   First_purchase          P_Child          P_Youth           P_Cook 
##         9.247748         2.992269         1.761546         3.229097 
##            P_DIY            P_Art 
##         1.992698         1.938089

bbbc_train = (subset(bbbc_train, select=-c(Last_purchase)))

bbbc_test = (subset(bbbc_test, select=-c(Last_purchase)))


bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

vif(bbbc_logit)

##           Gender Amount_purchased        Frequency   First_purchase 
##         1.021977         1.220305         2.173240         6.886806 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         1.904631         1.320305         2.060140         1.462770 
##            P_Art 
##         1.603865

bbbc_train = (subset(bbbc_train, select=-c(First_purchase)))

bbbc_test = (subset(bbbc_test, select=-c(First_purchase)))


bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

vif(bbbc_logit)

##           Gender Amount_purchased        Frequency          P_Child 
##         1.020217         1.213528         1.015899         1.215500 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.081019         1.228798         1.179821         1.229491

Linear Regression

note that this method gives continuous values and not probabilities and therefore is not suited for classification assume probabilities

bbbc_lm <- lm(Choice~., data = bbbc_train)

summary(bbbc_lm)

## 
## Call:
## lm(formula = Choice ~ ., data = bbbc_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9501 -0.2518 -0.1273  0.1509  1.1211 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3731865  0.0302933  12.319  < 2e-16 ***
## GenderMale       -0.1263728  0.0203683  -6.204 6.99e-10 ***
## Amount_purchased  0.0003688  0.0001123   3.283  0.00105 ** 
## Frequency        -0.0112345  0.0012344  -9.101  < 2e-16 ***
## P_Child          -0.0275983  0.0100284  -2.752  0.00599 ** 
## P_Youth          -0.0014841  0.0159946  -0.093  0.92609    
## P_Cook           -0.0428346  0.0102155  -4.193 2.90e-05 ***
## P_DIY            -0.0384262  0.0153017  -2.511  0.01213 *  
## P_Art             0.2183323  0.0140081  15.586  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3856 on 1591 degrees of freedom
## Multiple R-squared:  0.2114, Adjusted R-squared:  0.2075 
## F-statistic: 53.32 on 8 and 1591 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(bbbc_lm)

predict = predict(bbbc_lm, bbbc_test)
predict_choice = ifelse(predict>0.3, 1,0)



plot(bbbc_train$Amount_purchased, bbbc_train$Choice)
abline(lm(bbbc_train$Choice ~ bbbc_train$Amount_purchased))

caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1621   71
##          1  475  133
##                                           
##                Accuracy : 0.7626          
##                  95% CI : (0.7447, 0.7799)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2246          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.65196         
##             Specificity : 0.77338         
##          Pos Pred Value : 0.21875         
##          Neg Pred Value : 0.95804         
##              Prevalence : 0.08870         
##          Detection Rate : 0.05783         
##    Detection Prevalence : 0.26435         
##       Balanced Accuracy : 0.71267         
##                                           
##        'Positive' Class : 1               
##

make Choice a factor for Logistic Regression and SVM

bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])

Logistic Regression with all predictors

bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

#cooks distance plot

dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(devstd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")

plot(peastd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")

#model selection



#taking out outliers

cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)

bbbc_logit <- glm(Choice~., data = bbbc_train[-out,], family = binomial)

summary(bbbc_logit)

## 
## Call:
## glm(formula = Choice ~ ., family = binomial, data = bbbc_train[-out, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0699  -0.6284  -0.3964  -0.1077   2.4597  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.047670   0.220341  -0.216 0.828718    
## GenderMale       -0.880793   0.145243  -6.064 1.33e-09 ***
## Amount_purchased  0.002656   0.000833   3.188 0.001431 ** 
## Frequency        -0.117777   0.012245  -9.619  < 2e-16 ***
## P_Child          -0.281328   0.084375  -3.334 0.000855 ***
## P_Youth          -0.234805   0.127903  -1.836 0.066385 .  
## P_Cook           -0.419486   0.085446  -4.909 9.14e-07 ***
## P_DIY            -0.319053   0.121319  -2.630 0.008542 ** 
## P_Art             1.493295   0.110991  13.454  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1696.9  on 1558  degrees of freedom
## Residual deviance: 1263.8  on 1550  degrees of freedom
## AIC: 1281.8
## 
## Number of Fisher Scoring iterations: 5

#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
                   bbbc_test$Choice) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1501   58
##          1  595  146
##                                           
##                Accuracy : 0.7161          
##                  95% CI : (0.6972, 0.7344)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1973          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.71569         
##             Specificity : 0.71613         
##          Pos Pred Value : 0.19703         
##          Neg Pred Value : 0.96280         
##              Prevalence : 0.08870         
##          Detection Rate : 0.06348         
##    Detection Prevalence : 0.32217         
##       Balanced Accuracy : 0.71591         
##                                           
##        'Positive' Class : 1               
##

#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1) 
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))

#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)

exp(coef(bbbc_logit))

##      (Intercept)       GenderMale Amount_purchased        Frequency 
##        0.9534484        0.4144543        1.0026594        0.8888940 
##          P_Child          P_Youth           P_Cook            P_DIY 
##        0.7547806        0.7907248        0.6573845        0.7268366 
##            P_Art 
##        4.4517406

LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)

##           LR   Score       Wald
## 1  37.102971 32.5259  36.775120
## 2  10.303922 32.5259  10.165776
## 3 122.455019 32.5259  92.521765
## 4  11.893374 32.5259  11.117279
## 5   3.472190 32.5259   3.370205
## 6  26.822816 32.5259  24.101850
## 7   7.218459 32.5259   6.916238
## 8 233.897272 32.5259 181.015591

data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)

##           p.LR      p.Score       p.Wald
## 1 1.120529e-09 1.176148e-08 1.325714e-09
## 2 1.327478e-03 1.176148e-08 1.430717e-03
## 3 1.835293e-28 1.176148e-08 6.658840e-22
## 4 5.633399e-04 1.176148e-08 8.552718e-04
## 5 6.240862e-02 1.176148e-08 6.638548e-02
## 6 2.229890e-07 1.176148e-08 9.137245e-07
## 7 7.215762e-03 1.176148e-08 8.541649e-03
## 8 8.422534e-53 1.176148e-08 2.908566e-41

Logistic Regression taking out P_Youth (BEST LOGIT MODEL)

bbbc_logit <- glm(Choice~.-P_Youth, data = bbbc_train, family = binomial)

#cooks distance plot

dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(devstd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")

plot(peastd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")

#model selection



#taking out outliers

cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)

bbbc_logit <- glm(Choice~.-P_Youth, data = bbbc_train[-out,], family = binomial)

summary(bbbc_logit)

## 
## Call:
## glm(formula = Choice ~ . - P_Youth, family = binomial, data = bbbc_train[-out, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0258  -0.6400  -0.4045  -0.1091   2.4655  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.0595999  0.2181315  -0.273 0.784677    
## GenderMale       -0.8572699  0.1442686  -5.942 2.81e-09 ***
## Amount_purchased  0.0024290  0.0008208   2.959 0.003084 ** 
## Frequency        -0.1174284  0.0121335  -9.678  < 2e-16 ***
## P_Child          -0.2865928  0.0834734  -3.433 0.000596 ***
## P_Cook           -0.4020912  0.0837769  -4.800 1.59e-06 ***
## P_DIY            -0.3378003  0.1195938  -2.825 0.004734 ** 
## P_Art             1.4704568  0.1098880  13.381  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1707.9  on 1561  degrees of freedom
## Residual deviance: 1280.2  on 1554  degrees of freedom
## AIC: 1296.2
## 
## Number of Fisher Scoring iterations: 5

#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
                   bbbc_test$Choice) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1511   57
##          1  585  147
##                                          
##                Accuracy : 0.7209         
##                  95% CI : (0.702, 0.7391)
##     No Information Rate : 0.9113         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2036         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.72059        
##             Specificity : 0.72090        
##          Pos Pred Value : 0.20082        
##          Neg Pred Value : 0.96365        
##              Prevalence : 0.08870        
##          Detection Rate : 0.06391        
##    Detection Prevalence : 0.31826        
##       Balanced Accuracy : 0.72074        
##                                          
##        'Positive' Class : 1              
##

#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1) 
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))

#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)

exp(coef(bbbc_logit))

##      (Intercept)       GenderMale Amount_purchased        Frequency 
##        0.9421414        0.4243189        1.0024319        0.8892042 
##          P_Child           P_Cook            P_DIY            P_Art 
##        0.7508174        0.6689197        0.7133377        4.3512224

LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)

##           LR    Score       Wald
## 1  35.568898 31.07381  35.309507
## 2   8.850279 31.07381   8.757069
## 3 123.870373 31.07381  93.663831
## 4  12.612136 31.07381  11.787838
## 5  25.581283 31.07381  23.035677
## 6   8.344074 31.07381   7.978166
## 7 230.574526 31.07381 179.062355

data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)

##           p.LR      p.Score       p.Wald
## 1 2.461871e-09 2.483999e-08 2.812558e-09
## 2 2.930440e-03 2.483999e-08 3.084043e-03
## 3 8.993083e-29 2.483999e-08 3.739320e-22
## 4 3.832502e-04 2.483999e-08 5.961892e-04
## 5 4.241336e-07 2.483999e-08 1.590228e-06
## 6 3.869474e-03 2.483999e-08 4.734489e-03
## 7 4.467337e-52 2.483999e-08 7.765136e-41

Coefficients (subtract 1 from coefficient and multiply by 100 to get percentage change per unit increase)

exp(coef(bbbc_logit))

##      (Intercept)       GenderMale Amount_purchased        Frequency 
##        0.9421414        0.4243189        1.0024319        0.8892042 
##          P_Child           P_Cook            P_DIY            P_Art 
##        0.7508174        0.6689197        0.7133377        4.3512224

Logistic Regression AIC criteria (takes out P_youth)

bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

#cooks distance plot

dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(devstd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")

plot(peastd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")

#model selection (AIC)
bbbc_logit = step(bbbc_logit, direction="both",test="Chisq", trace = F) 

summary(bbbc_logit)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Cook + P_DIY + P_Art, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.30792  -0.69156  -0.47311  -0.02466   2.84228  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.2894506  0.2026211  -1.429  0.15314    
## GenderMale       -0.8120440  0.1345723  -6.034  1.6e-09 ***
## Amount_purchased  0.0023859  0.0007678   3.108  0.00189 ** 
## Frequency        -0.0885491  0.0103772  -8.533  < 2e-16 ***
## P_Child          -0.1964495  0.0720116  -2.728  0.00637 ** 
## P_Cook           -0.2940503  0.0727520  -4.042  5.3e-05 ***
## P_DIY            -0.2823065  0.1076089  -2.623  0.00870 ** 
## P_Art             1.2441330  0.0988603  12.585  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1445.1  on 1592  degrees of freedom
## AIC: 1461.1
## 
## Number of Fisher Scoring iterations: 5

#cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
#out <- which(cooks.distance(AIC)>cooks_sd_cutoff)

#taking out outliers

cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)

bbbc_logit <- glm(Choice ~ Gender + Amount_purchased + Frequency + 
    P_Child + P_Cook + P_DIY + P_Art, data = bbbc_train[-out,], family = binomial)



#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
                   bbbc_test$Choice) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1511   57
##          1  585  147
##                                          
##                Accuracy : 0.7209         
##                  95% CI : (0.702, 0.7391)
##     No Information Rate : 0.9113         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2036         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.72059        
##             Specificity : 0.72090        
##          Pos Pred Value : 0.20082        
##          Neg Pred Value : 0.96365        
##              Prevalence : 0.08870        
##          Detection Rate : 0.06391        
##    Detection Prevalence : 0.31826        
##       Balanced Accuracy : 0.72074        
##                                          
##        'Positive' Class : 1              
##

#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1) 
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))

#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)

exp(coef(bbbc_logit))

##      (Intercept)       GenderMale Amount_purchased        Frequency 
##        0.9421414        0.4243189        1.0024319        0.8892042 
##          P_Child           P_Cook            P_DIY            P_Art 
##        0.7508174        0.6689197        0.7133377        4.3512224

LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)

##           LR    Score       Wald
## 1  35.568898 31.07381  35.309507
## 2   8.850279 31.07381   8.757069
## 3 123.870373 31.07381  93.663831
## 4  12.612136 31.07381  11.787838
## 5  25.581283 31.07381  23.035677
## 6   8.344074 31.07381   7.978166
## 7 230.574526 31.07381 179.062355

data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)

##           p.LR      p.Score       p.Wald
## 1 2.461871e-09 2.483999e-08 2.812558e-09
## 2 2.930440e-03 2.483999e-08 3.084043e-03
## 3 8.993083e-29 2.483999e-08 3.739320e-22
## 4 3.832502e-04 2.483999e-08 5.961892e-04
## 5 4.241336e-07 2.483999e-08 1.590228e-06
## 6 3.869474e-03 2.483999e-08 4.734489e-03
## 7 4.467337e-52 2.483999e-08 7.765136e-41

Logistic Regression with BIC criteria (takes out predictor P_youth)

bbbc_logit <- glm(Choice~., data = bbbc_train, family = binomial)

#cooks distance plot

dev <-residuals(bbbc_logit, type = "deviance")
pea <-residuals(bbbc_logit, type = "pearson")
devstd<-residuals(bbbc_logit, type = "deviance")/sqrt(1 - hatvalues(bbbc_logit)) # standardized deviance residuals
peastd <-residuals(bbbc_logit, type = "pearson")/sqrt(1 - hatvalues(bbbc_logit)) # standardized pearson residuals

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(devstd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[bbbc_logit$model$Choice==0], col = "blue")

plot(peastd[bbbc_logit$model$Choice==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[bbbc_logit$model$Choice==0], col = "blue")

#model selection (BIC)
bbbc_logit = step(bbbc_logit, direction="both",test="Chisq", trace = F,k=log(nrow(bbbc_train))) 

summary(bbbc_logit)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Cook + P_Art, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2775  -0.6899  -0.4718  -0.0161   2.8399  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3080705  0.2017240  -1.527  0.12671    
## GenderMale       -0.8036007  0.1341843  -5.989 2.11e-09 ***
## Amount_purchased  0.0021863  0.0007609   2.873  0.00406 ** 
## Frequency        -0.0878272  0.0103437  -8.491  < 2e-16 ***
## P_Child          -0.2193596  0.0713916  -3.073  0.00212 ** 
## P_Cook           -0.3273889  0.0719981  -4.547 5.44e-06 ***
## P_Art             1.2061765  0.0970177  12.433  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1452.2  on 1593  degrees of freedom
## AIC: 1466.2
## 
## Number of Fisher Scoring iterations: 5

#identifying outliers

cooks_sd_cutoff = 3*sd(cooks.distance(bbbc_logit))
out <- which(cooks.distance(bbbc_logit)>cooks_sd_cutoff)

#fit BIC model without outliers

bbbc_logit <- glm(Choice ~ Gender + Amount_purchased + Frequency + 
    P_Child + P_Cook + P_Art, data = bbbc_train[-out,], family = binomial)



#Maximized cutoff (change predict object)
pred <- prediction(predict(bbbc_logit, bbbc_test, type = "response"),
                   bbbc_test$Choice) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot (don't need to change)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix (change predict object)
predict = predict(bbbc_logit, bbbc_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1499   59
##          1  597  145
##                                           
##                Accuracy : 0.7148          
##                  95% CI : (0.6958, 0.7332)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1945          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.71078         
##             Specificity : 0.71517         
##          Pos Pred Value : 0.19542         
##          Neg Pred Value : 0.96213         
##              Prevalence : 0.08870         
##          Detection Rate : 0.06304         
##    Detection Prevalence : 0.32261         
##       Balanced Accuracy : 0.71298         
##                                           
##        'Positive' Class : 1               
##

#diagnostics
cooks_distance = cooks.distance(bbbc_logit)
plot(cooks_distance,col="pink", pch=19, cex=1) 
text(cooks.distance(bbbc_logit),labels = rownames(bbbc_train))

#hoslem.test(bbbc_logit$Choice, fitted(bbbc_logit), g=10)

exp(coef(bbbc_logit))

##      (Intercept)       GenderMale Amount_purchased        Frequency 
##        0.9537936        0.4249002        1.0020992        0.8896104 
##          P_Child           P_Cook            P_Art 
##        0.7377371        0.6418250        4.1014105

LR = Anova(bbbc_logit, test = "LR")
Wald = Anova(bbbc_logit, test = "Wald")
score = anova(bbbc_logit, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)

##           LR   Score       Wald
## 1  35.996590 32.0851  35.727003
## 2   6.765368 32.0851   6.712547
## 3 123.677827 32.0851  93.494151
## 4  14.980828 32.0851  13.877597
## 5  31.983008 32.0851  28.249544
## 6 220.492576 32.0851 173.461103

data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)

##           p.LR      p.Score       p.Wald
## 1 1.976631e-09 1.475649e-08 2.269958e-09
## 2 9.294376e-03 1.475649e-08 9.573691e-03
## 3 9.909506e-29 1.475649e-08 4.074026e-22
## 4 1.086090e-04 1.475649e-08 1.951104e-04
## 5 1.555270e-08 1.475649e-08 1.066397e-07
## 6 7.062212e-50 1.475649e-08 1.297988e-39

SVM Radial Attempt with VIF less than 5 criteria (BEST SO FAR)

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))

#bbbc_tuned$best.parameters

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000)
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10000 
## 
## Number of Support Vectors:  712
## 
##  ( 296 416 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)

#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1817  135
##          1  279   69
##                                           
##                Accuracy : 0.82            
##                  95% CI : (0.8037, 0.8355)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1556          
##                                           
##  Mcnemar's Test P-Value : 2.094e-12       
##                                           
##             Sensitivity : 0.3382          
##             Specificity : 0.8669          
##          Pos Pred Value : 0.1983          
##          Neg Pred Value : 0.9308          
##              Prevalence : 0.0887          
##          Detection Rate : 0.0300          
##    Detection Prevalence : 0.1513          
##       Balanced Accuracy : 0.6026          
##                                           
##        'Positive' Class : 1               
##

#table(bbbc_test$Choice)
#table(svmpredict)

SVM Linear Attempt with VIF less than 5 criteria

g/c: 0.03 0.7 0.1 0.8

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])
#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))

#bbbc_tuned$best.parameters

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 100000, kernel = "linear")
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 1e+05, 
##     kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1e+05 
## 
## Number of Support Vectors:  1133
## 
##  ( 377 756 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)

#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1720  140
##          1  376   64
##                                          
##                Accuracy : 0.7757         
##                  95% CI : (0.758, 0.7926)
##     No Information Rate : 0.9113         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0883         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.31373        
##             Specificity : 0.82061        
##          Pos Pred Value : 0.14545        
##          Neg Pred Value : 0.92473        
##              Prevalence : 0.08870        
##          Detection Rate : 0.02783        
##    Detection Prevalence : 0.19130        
##       Balanced Accuracy : 0.56717        
##                                          
##        'Positive' Class : 1              
##

#table(bbbc_test$Choice)
#table(svmpredict)

Radial but reintroduce variables we took out due to VIF

bbbc_test <- read_excel("BBBC-Test.xlsx")

bbbc_test <- (subset(bbbc_test, select=-c(Observation)))

bbbc_test$Gender[bbbc_test$Gender==1]="Male";

bbbc_test$Gender[bbbc_test$Gender==0]="Female";




bbbc_train <- read_excel("BBBC-Train.xlsx")

bbbc_train <- (subset(bbbc_train, select=-c(Observation)))

bbbc_train$Gender[bbbc_train$Gender==1]="Male";

bbbc_train$Gender[bbbc_train$Gender==0]="Female";




anyNA(bbbc_train)

## [1] FALSE

anyNA(bbbc_test)

## [1] FALSE

bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])


#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))

#bbbc_tuned$best.parameters

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000)
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10000 
## 
## Number of Support Vectors:  674
## 
##  ( 275 399 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)

#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1776  119
##          1  320   85
##                                          
##                Accuracy : 0.8091         
##                  95% CI : (0.7925, 0.825)
##     No Information Rate : 0.9113         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1827         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.41667        
##             Specificity : 0.84733        
##          Pos Pred Value : 0.20988        
##          Neg Pred Value : 0.93720        
##              Prevalence : 0.08870        
##          Detection Rate : 0.03696        
##    Detection Prevalence : 0.17609        
##       Balanced Accuracy : 0.63200        
##                                          
##        'Positive' Class : 1              
##

#table(bbbc_test$Choice)
#table(svmpredict)

SVM Linear All Predictors

bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])


#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))

#bbbc_tuned$best.parameters

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000, kernel = "linear")
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000, 
##     kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10000 
## 
## Number of Support Vectors:  777
## 
##  ( 369 408 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)

#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1988  148
##          1  108   56
##                                           
##                Accuracy : 0.8887          
##                  95% CI : (0.8751, 0.9013)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.99990         
##                                           
##                   Kappa : 0.2446          
##                                           
##  Mcnemar's Test P-Value : 0.01479         
##                                           
##             Sensitivity : 0.27451         
##             Specificity : 0.94847         
##          Pos Pred Value : 0.34146         
##          Neg Pred Value : 0.93071         
##              Prevalence : 0.08870         
##          Detection Rate : 0.02435         
##    Detection Prevalence : 0.07130         
##       Balanced Accuracy : 0.61149         
##                                           
##        'Positive' Class : 1               
##

polynomial svm (worst sensitivity)

bbbc_test[["Choice"]] = factor(bbbc_test[["Choice"]])

bbbc_train[["Choice"]] = factor(bbbc_train[["Choice"]])


#bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))

#bbbc_tuned$best.parameters

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = 0.1, cost = 10000, kernel = "polynomial")
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = 0.1, cost = 10000, 
##     kernel = "polynomial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  10000 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  664
## 
##  ( 306 358 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
#table(pred = svmpredict, true = bbbc_test$Choice)

#table(pred = svmpredict, true = bbbc_test$Choice)
caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1875  140
##          1  221   64
##                                           
##                Accuracy : 0.843           
##                  95% CI : (0.8275, 0.8577)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1766          
##                                           
##  Mcnemar's Test P-Value : 2.548e-05       
##                                           
##             Sensitivity : 0.31373         
##             Specificity : 0.89456         
##          Pos Pred Value : 0.22456         
##          Neg Pred Value : 0.93052         
##              Prevalence : 0.08870         
##          Detection Rate : 0.02783         
##    Detection Prevalence : 0.12391         
##       Balanced Accuracy : 0.60414         
##                                           
##        'Positive' Class : 1               
##

SVM Linear with class weights (no change)

bbbc_tuned = tune.svm(Choice~., data = bbbc_train, gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1), class.weights= c("0" = 1, "1" = 10))

bbbc_tuned$best.parameters

##    gamma cost class.weights
## 61  0.01  0.7             1

mysvm = svm(formula = Choice~., data= bbbc_train, gamma = bbbc_tuned$best.parameters$gamma, cost = bbbc_tuned$best.parameters$cost, kernel = "linear")
summary(mysvm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, gamma = bbbc_tuned$best.parameters$gamma, 
##     cost = bbbc_tuned$best.parameters$cost, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.7 
## 
## Number of Support Vectors:  739
## 
##  ( 367 372 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

svmpredict = predict(mysvm, bbbc_test, type = "response")
table(pred = svmpredict, true = bbbc_test$Choice)

##     true
## pred    0    1
##    0 2011  147
##    1   85   57

caret::confusionMatrix(as.factor(svmpredict),as.factor(bbbc_test$Choice), positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2011  147
##          1   85   57
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8861, 0.9111)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9802          
##                                           
##                   Kappa : 0.2768          
##                                           
##  Mcnemar's Test P-Value : 6.206e-05       
##                                           
##             Sensitivity : 0.27941         
##             Specificity : 0.95945         
##          Pos Pred Value : 0.40141         
##          Neg Pred Value : 0.93188         
##              Prevalence : 0.08870         
##          Detection Rate : 0.02478         
##    Detection Prevalence : 0.06174         
##       Balanced Accuracy : 0.61943         
##                                           
##        'Positive' Class : 1               
##

questions how to prioritize sensitivity? should we try interaction terms?

Bookbinders Case Study

Brett Stokes

2/27/2021