library(readxl)
library(car)
## Loading required package: carData
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.0.3
## ResourceSelection 0.3-5 2019-07-22
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.5
library(e1071)
BBBC.train <- read_excel("BBBC-Train.xlsx")
BBBC.test <- read_excel("BBBC-Test.xlsx")
Removing “Observation” variable and converting “Choice and Gender” to factors.
BBBC.train$Observation <- NULL
BBBC.test$Observation <- NULL
BBBC.train$Choice <- as.factor(BBBC.train$Choice)
BBBC.train$Gender <- as.factor(BBBC.train$Gender)
BBBC.test$Choice <- as.factor(BBBC.test$Choice)
BBBC.test$Gender <- as.factor(BBBC.test$Gender)
Create 70/30 split for GLM
set.seed(12345)
pseudo.index <- sample(1:nrow(BBBC.train), 0.7*nrow(BBBC.train))
BBBC.pseudo.train <- BBBC.train[pseudo.index, ]
BBBC.pseudo.test <- BBBC.train[-pseudo.index, ]
BBBC.pseudo.test.true <- BBBC.pseudo.test$Choice
Exploratory Analysis:
Relationships between variables:
pairs(BBBC.train)
Correlation table:
cor(BBBC.train[sapply(BBBC.train, is.numeric)])
## Amount_purchased Frequency Last_purchase First_purchase
## Amount_purchased 1.00000000 0.0136664846 0.44070127 0.3748139
## Frequency 0.01366648 1.0000000000 -0.04194328 0.4459457
## Last_purchase 0.44070127 -0.0419432803 1.00000000 0.8146747
## First_purchase 0.37481393 0.4459457457 0.81467469 1.0000000
## P_Child 0.29931372 -0.0433279437 0.67913392 0.5448208
## P_Youth 0.18755727 -0.0095854745 0.45325891 0.3678921
## P_Cook 0.30425340 0.0004968833 0.67250539 0.5710548
## P_DIY 0.22331539 -0.0089634125 0.55816739 0.4620188
## P_Art 0.27248948 -0.0613754066 0.53433415 0.4420821
## P_Child P_Youth P_Cook P_DIY P_Art
## Amount_purchased 0.29931372 0.187557270 0.3042533969 0.223315392 0.27248948
## Frequency -0.04332794 -0.009585474 0.0004968833 -0.008963412 -0.06137541
## Last_purchase 0.67913392 0.453258910 0.6725053933 0.558167395 0.53433415
## First_purchase 0.54482083 0.367892128 0.5710547918 0.462018843 0.44208206
## P_Child 1.00000000 0.174826719 0.2947065185 0.253837077 0.22451285
## P_Youth 0.17482672 1.000000000 0.1816566401 0.188683456 0.14175122
## P_Cook 0.29470652 0.181656640 1.0000000000 0.271725126 0.19168076
## P_DIY 0.25383708 0.188683456 0.2717251256 1.000000000 0.20779106
## P_Art 0.22451285 0.141751220 0.1916807611 0.207791065 1.00000000
Highest correlations are between “First_purchase” and “Last_purchase”.
Boxplots for the non-categorical variables:
par(mfrow=c(3,3))
hist(BBBC.train$Amount_purchased, xlab = "Amount Purchased")
hist(BBBC.train$Frequency, xlab = "Total Num. Purchases in Study")
hist(BBBC.train$Last_purchase, xlab = "Months Since Last Purchase")
hist(BBBC.train$First_purchase, xlab = "Months Since First Purchase")
hist(BBBC.train$P_Child, xlab = "No. Children's Books Purchased")
hist(BBBC.train$P_Youth, xlab = "No. Youth Books Purchased")
hist(BBBC.train$P_Cook, xlab = "No. Cook Books Purchased")
hist(BBBC.train$P_DIY, xlab = "No. DIY Books Purchased")
hist(BBBC.train$P_Art, xlab = "No. Art Books Purchased")
“Amount Purchased” seems normally distributed, but all others have a heavy positive skew.
Linear Regression
lm1 <- lm(as.numeric(Choice) ~ ., data = BBBC.train)
summary(lm1)
##
## Call:
## lm(formula = as.numeric(Choice) ~ ., data = BBBC.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3642284 0.0307411 44.378 < 2e-16 ***
## Gender1 -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
Check for multicollinearity:
vif(lm1)
## Gender Amount_purchased Frequency Last_purchase
## 1.005801 1.248066 3.253860 18.770402
## First_purchase P_Child P_Youth P_Cook
## 9.685333 3.360349 1.775022 3.324928
## P_DIY P_Art
## 2.016910 2.273771
remove “Last Purchase”
lm1.novif <- lm(as.numeric(Choice) ~ . - Last_purchase, BBBC.train)
vif(lm1.novif)
## Gender Amount_purchased Frequency First_purchase
## 1.005634 1.235982 2.651820 7.182666
## P_Child P_Youth P_Cook P_DIY
## 1.949849 1.307915 2.009609 1.457362
## P_Art
## 1.634878
Check model for significant variables:
summary(lm1.novif)
##
## Call:
## lm(formula = as.numeric(Choice) ~ . - Last_purchase, data = BBBC.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0018 -0.2482 -0.1277 0.1567 1.1035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3926595 0.0309609 44.981 < 2e-16 ***
## Gender1 -0.1290720 0.0203424 -6.345 2.89e-10 ***
## Amount_purchased 0.0003518 0.0001122 3.135 0.001753 **
## Frequency -0.0157943 0.0019980 -7.905 4.97e-15 ***
## First_purchase 0.0046036 0.0015884 2.898 0.003803 **
## P_Child -0.0502183 0.0126891 -3.958 7.90e-05 ***
## P_Youth -0.0225339 0.0175326 -1.285 0.198888
## P_Cook -0.0667467 0.0131127 -5.090 4.00e-07 ***
## P_DIY -0.0606486 0.0170835 -3.550 0.000396 ***
## P_Art 0.1916012 0.0167447 11.443 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3847 on 1590 degrees of freedom
## Multiple R-squared: 0.2156, Adjusted R-squared: 0.2111
## F-statistic: 48.55 on 9 and 1590 DF, p-value: < 2.2e-16
Check diagnostic plots:
par(mfrow=c(2,2))
plot(lm1.novif, which=c(1:4))
Evidence of non-normality.
Linear Regression with stepwise selection:
lm1.stepwise <- ols_step_both_p(lm1, pent = 0.05, prem = 0.05, details = F)
lm1.stepwise
##
## Stepwise Selection Summary
## -----------------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------------------------
## 1 P_Art addition 0.128 0.127 227.4360 1649.2034 0.4046
## 2 Frequency addition 0.170 0.169 142.0340 1572.6125 0.3949
## 3 Gender addition 0.188 0.186 106.5840 1539.7192 0.3908
## 4 P_Cook addition 0.200 0.198 82.3710 1516.8351 0.3879
## 5 P_DIY addition 0.204 0.201 77.1250 1511.8878 0.3871
## 6 Amount_purchased addition 0.208 0.205 70.8160 1505.8838 0.3863
## 7 P_Child addition 0.211 0.208 64.8840 1500.2051 0.3855
## 8 Last_purchase addition 0.228 0.224 31.7060 1467.7006 0.3815
## 9 P_Youth addition 0.239 0.235 10.2160 1446.2388 0.3788
## -----------------------------------------------------------------------------------------------
lm1.step <- lm(as.numeric(Choice)~ P_Art + Frequency + Gender + P_Cook + P_DIY + Amount_purchased + P_Child + Last_purchase + P_Youth, BBBC.train)
summary(lm1.step)
##
## Call:
## lm(formula = as.numeric(Choice) ~ P_Art + Frequency + Gender +
## P_Cook + P_DIY + Amount_purchased + P_Child + Last_purchase +
## P_Youth, data = BBBC.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9802 -0.2452 -0.1157 0.1655 1.0595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3727367 0.0297590 46.128 < 2e-16 ***
## P_Art 0.1150034 0.0192719 5.967 2.97e-09 ***
## Frequency -0.0110830 0.0012128 -9.138 < 2e-16 ***
## Gender1 -0.1316464 0.0200208 -6.575 6.56e-11 ***
## P_Cook -0.1433497 0.0165218 -8.676 < 2e-16 ***
## P_DIY -0.1365578 0.0197520 -6.914 6.82e-12 ***
## Amount_purchased 0.0002742 0.0001110 2.470 0.0136 *
## P_Child -0.1275991 0.0163571 -7.801 1.11e-14 ***
## Last_purchase 0.0894288 0.0116772 7.658 3.25e-14 ***
## P_Youth -0.0973642 0.0200903 -4.846 1.38e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared: 0.2395, Adjusted R-squared: 0.2352
## F-statistic: 55.63 on 9 and 1590 DF, p-value: < 2.2e-16
There is enough statistical evidence for us to reject the null hypothesis that all model coefficients are equal to zero.
Diagnostics:
par(mfrow=c(2,2))
plot(lm1.step, which=c(1:4))
Check VIF again:
vif(lm1.step)
## P_Art Frequency Gender P_Cook
## 2.233698 1.007855 1.004715 3.290657
## P_DIY Amount_purchased P_Child Last_purchase
## 2.009454 1.248033 3.341880 13.920175
## P_Youth
## 1.771355
“Last_purchase” has a very high VIF value, and the QQ plot shows non-normality.
LOGIT:
glm.base <- glm(Choice ~ ., BBBC.pseudo.train, family = "binomial")
vif(glm.base)
## Gender Amount_purchased Frequency Last_purchase
## 1.027333 1.231797 2.531715 17.150066
## First_purchase P_Child P_Youth P_Cook
## 9.337105 3.080295 1.831745 3.258309
## P_DIY P_Art
## 2.100789 2.048933
Remove “Last Purchase” and try again.
glm.base<-glm(Choice ~ . - Last_purchase, BBBC.pseudo.train, family="binomial")
vif(glm.base)
## Gender Amount_purchased Frequency First_purchase
## 1.027489 1.215065 2.219704 7.129562
## P_Child P_Youth P_Cook P_DIY
## 1.916577 1.392179 1.999162 1.492414
## P_Art
## 1.656146
Remove “First purchase” and try again:
glm.base <- glm(Choice ~ . - Last_purchase - First_purchase, BBBC.pseudo.train, family = "binomial")
vif(glm.base)
## Gender Amount_purchased Frequency P_Child
## 1.028240 1.209582 1.014939 1.226670
## P_Youth P_Cook P_DIY P_Art
## 1.084659 1.252361 1.194367 1.251723
Run our final model:
glm.null <- glm(Choice ~ 1, BBBC.pseudo.train, family = "binomial")
glm.full <- glm(Choice ~ . - Last_purchase - First_purchase, BBBC.pseudo.train, family = "binomial")
glm.step <- step(glm.null, scope = list(upper = glm.full),
direction = "both", test="Chisq", trace = F)
summary(glm.step)
##
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook +
## Amount_purchased + P_DIY + P_Child, family = "binomial",
## data = BBBC.pseudo.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2809 -0.6880 -0.4703 -0.1612 2.6014
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4675401 0.2439857 -1.916 0.055332 .
## P_Art 1.2303020 0.1174228 10.478 < 2e-16 ***
## Frequency -0.0866254 0.0122194 -7.089 1.35e-12 ***
## Gender1 -0.7690962 0.1617603 -4.755 1.99e-06 ***
## P_Cook -0.3072905 0.0883607 -3.478 0.000506 ***
## Amount_purchased 0.0027430 0.0009229 2.972 0.002956 **
## P_DIY -0.2677571 0.1246974 -2.147 0.031773 *
## P_Child -0.1728451 0.0859291 -2.011 0.044274 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1248.5 on 1119 degrees of freedom
## Residual deviance: 1006.8 on 1112 degrees of freedom
## AIC: 1022.8
##
## Number of Fisher Scoring iterations: 5
Goodness of Fit:
hoslem.test(glm.step$y, fitted(glm.step), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm.step$y, fitted(glm.step)
## X-squared = 4.5158, df = 8, p-value = 0.8078
Here we see that the p-value of the Hosmer Lemeshow test indicates we fail to reject the null hypothesis, so we can say that our stepwise logistical model fits the data well
Check for influential points with Cook’s Distance:
plot(glm.step, which = 4)
glm.step.probs <- predict(glm.step, BBBC.pseudo.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.5] = 1
caret::confusionMatrix(as.factor(glm.step.preds), BBBC.pseudo.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 341 87
## 1 14 38
##
## Accuracy : 0.7896
## 95% CI : (0.7503, 0.8252)
## No Information Rate : 0.7396
## P-Value [Acc > NIR] : 0.00638
##
## Kappa : 0.3263
##
## Mcnemar's Test P-Value : 7.82e-13
##
## Sensitivity : 0.30400
## Specificity : 0.96056
## Pos Pred Value : 0.73077
## Neg Pred Value : 0.79673
## Prevalence : 0.26042
## Detection Rate : 0.07917
## Detection Prevalence : 0.10833
## Balanced Accuracy : 0.63228
##
## 'Positive' Class : 1
##
78.96% accuracy.
Compare against the test set:
glm.step.probs <- predict(glm.step, BBBC.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.5] = 1
caret::confusionMatrix(as.factor(glm.step.preds), BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1990 133
## 1 106 71
##
## Accuracy : 0.8961
## 95% CI : (0.8829, 0.9083)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.99461
##
## Kappa : 0.3164
##
## Mcnemar's Test P-Value : 0.09261
##
## Sensitivity : 0.34804
## Specificity : 0.94943
## Pos Pred Value : 0.40113
## Neg Pred Value : 0.93735
## Prevalence : 0.08870
## Detection Rate : 0.03087
## Detection Prevalence : 0.07696
## Balanced Accuracy : 0.64873
##
## 'Positive' Class : 1
##
89.61% accuracy
Profit with un-optimized GLM:
VC.mail.posting <- 0.65
VC.book <- 15
VC.book.OH <- VC.book*0.45
P.book <- 31.95
Prof.book <- P.book - VC.mail.posting - VC.book - VC.book.OH
N.Cust <- 2300
N.booksold <- 74
N.mailing <- 106
Profit.unOpt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.unOpt <- Profit.unOpt / N.Cust
Profit.unOpt
## [1] 637.8
$637.8
Prof.per.Cust.unOpt
## [1] 0.2773043
28 cents per customer.
Tuning the selection threshold:
pr = predict(glm.step, BBBC.pseudo.test, type = "response")
response = BBBC.pseudo.test$Choice
pred <- prediction(pr, response)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 4)
optimal prediction threshold is 0.22
Check confusion matrix:
glm.step.probs <- predict(glm.step, BBBC.test, type = "response")
glm.step.preds <- rep(0, length(glm.step.probs))
glm.step.preds[glm.step.probs > 0.22] = 1
caret::confusionMatrix(as.factor(glm.step.preds), BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1490 57
## 1 606 147
##
## Accuracy : 0.7117
## 95% CI : (0.6927, 0.7302)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1948
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72059
## Specificity : 0.71088
## Pos Pred Value : 0.19522
## Neg Pred Value : 0.96315
## Prevalence : 0.08870
## Detection Rate : 0.06391
## Detection Prevalence : 0.32739
## Balanced Accuracy : 0.71573
##
## 'Positive' Class : 1
##
71.17% accuracy
Profit estimation:
N.Cust <- 2300
N.booksold <- 149
N.mailing <- 634
Profit.Opt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.Opt <- Profit.Opt / N.Cust
Profit.Opt
## [1] 1010.85
$1,010.85
Prof.per.Cust.Opt
## [1] 0.4395
43.95 cents per customer.
Percent improvement of optimized to un-optimized GLM:
glm.tuned.pct.imp <- ((Profit.Opt - Profit.unOpt) / Profit.unOpt) * 100
glm.tuned.pct.imp
## [1] 58.49012
“1” Test set:
Test.Choice.1 <- sum(BBBC.test$Choice == 1)
Test.Choice.1
## [1] 204
Test.Choice.0 <- sum(BBBC.test$Choice == 0)
Test.Choice.0
## [1] 2096
Test.Choice.0+Test.Choice.1
## [1] 2300
“1” Training set:
Training.Choice.1 <- sum(BBBC.train$Choice == 1)
Training.Choice.1
## [1] 400
Training.Choice.0 <- sum(BBBC.train$Choice == 0)
Training.Choice.0
## [1] 1200
Training.Choice.0+Training.Choice.1
## [1] 1600
If Bookbinders used no model at all:
N.Cust <- 2300
N.booksold <- 204
N.mailing <- 2096
Profit.NoMod <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.NoMod <- Profit.NoMod / N.Cust
Profit.NoMod
## [1] 585.8
$585.80
Prof.per.Cust.NoMod
## [1] 0.2546957
25.47 cents per customer.
glm.tuned.pct.boost <- ((Profit.Opt - Profit.NoMod) / Profit.NoMod) * 100
glm.tuned.pct.boost
## [1] 72.55889
Our model boosts profit by 72.55%.
Tune the model to minimize its error:
cut.seq <- seq(0.01, 0.99, by = 0.01)
seq.glm <- glm(Choice ~ P_Art + Frequency + Gender + P_Cook + P_Child + P_DIY, family = "binomial", data = BBBC.pseudo.train)
Run the new GLM:
err=c();
for(i in 1:length(cut.seq)){
est.prob = predict(seq.glm, newdata=BBBC.pseudo.test[ ,-1], type="response")
est.class = ifelse(est.prob > cut.seq[i], 1, 0)
err[i] = mean(BBBC.pseudo.test.true != est.class, na.rm=TRUE)
}
par(mfrow=c(1,1))
plot(cut.seq, err, type="l")
min(err)
## [1] 0.20625
cut.seq[which.min(err)]
## [1] 0.31
final.cut = cut.seq[which.min(err)]
seq.glm.probs <- predict(seq.glm, BBBC.test, type = "response")
seq.glm.preds <- rep(0, length(seq.glm.probs))
seq.glm.preds[seq.glm.probs > final.cut] = 1
caret::confusionMatrix(as.factor(seq.glm.preds), BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1759 93
## 1 337 111
##
## Accuracy : 0.813
## 95% CI : (0.7965, 0.8288)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2489
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.54412
## Specificity : 0.83922
## Pos Pred Value : 0.24777
## Neg Pred Value : 0.94978
## Prevalence : 0.08870
## Detection Rate : 0.04826
## Detection Prevalence : 0.19478
## Balanced Accuracy : 0.69167
##
## 'Positive' Class : 1
##
81.3% accuracy
N.Cust <- 2300
N.booksold <- 59
N.mailing <- 73
Profit.Opt <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.Opt <- Profit.Opt / N.Cust
Profit.Opt
## [1] 516
$516
Prof.per.Cust.Opt
## [1] 0.2243478
22.43 centers per customer.
SVM:
tuned.svm <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'linear',
gamma = seq(0.01, 0.1, by = 0.025),
cost = seq(0.1, 1.2, by=0.1), scale=TRUE)
tuned.svm$best.parameters
SVM with tuned parameters:
svm1 <- svm(Choice ~ ., data = BBBC.train, kernel = 'linear',
gamma = tuned.svm$best.parameters$gamma,
cost = tuned.svm$best.parameters$cost)
Checking predictions for SVM1 against the training data:
svm.preds <- predict(svm1, BBBC.train)
caret::confusionMatrix(svm.preds, BBBC.train$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1149 270
## 1 51 130
##
## Accuracy : 0.7994
## 95% CI : (0.7789, 0.8187)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 1.765e-06
##
## Kappa : 0.3456
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.32500
## Specificity : 0.95750
## Pos Pred Value : 0.71823
## Neg Pred Value : 0.80973
## Prevalence : 0.25000
## Detection Rate : 0.08125
## Detection Prevalence : 0.11313
## Balanced Accuracy : 0.64125
##
## 'Positive' Class : 1
##
79.88% accuracy
Running SVM on testing data:
svm.preds <- predict(svm1, BBBC.test)
caret::confusionMatrix(svm.preds, BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2015 147
## 1 81 57
##
## Accuracy : 0.9009
## 95% CI : (0.8879, 0.9128)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9621
##
## Kappa : 0.2819
##
## Mcnemar's Test P-Value : 1.672e-05
##
## Sensitivity : 0.27941
## Specificity : 0.96135
## Pos Pred Value : 0.41304
## Neg Pred Value : 0.93201
## Prevalence : 0.08870
## Detection Rate : 0.02478
## Detection Prevalence : 0.06000
## Balanced Accuracy : 0.62038
##
## 'Positive' Class : 1
##
89.91% accuracy
Profit numbers on linear kernel SVM:
N.Cust <- 2300
N.booksold <- 57
N.mailing <- 85
Profit.LSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.LSVM <- Profit.LSVM / N.Cust
Profit.LSVM
## [1] 489.1
$489.10
Prof.per.Cust.LSVM
## [1] 0.2126522
21.27 cents per customer
Tuning on RBF:
tuned.svm.rad <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'radial',
gamma = seq(0.01, 0.1, by = 0.01),
cost = seq(0.1, 1.2, by=0.25), scale=TRUE)
tuned.svm.rad$best.parameters
svm2 <- svm(Choice ~ ., data = BBBC.train, kernel = 'radial',
gamma = tuned.svm.rad$best.parameters$gamma,
cost = tuned.svm.rad$best.parameters$cost)
svm.preds.rad <- predict(svm2, BBBC.test)
caret::confusionMatrix(svm.preds.rad, BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2059 175
## 1 37 29
##
## Accuracy : 0.9078
## 95% CI : (0.8953, 0.9193)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.7355
##
## Kappa : 0.1792
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.14216
## Specificity : 0.98235
## Pos Pred Value : 0.43939
## Neg Pred Value : 0.92167
## Prevalence : 0.08870
## Detection Rate : 0.01261
## Detection Prevalence : 0.02870
## Balanced Accuracy : 0.56225
##
## 'Positive' Class : 1
##
90.78% accuracy
Profit:
N.Cust <- 2300
N.booksold <- 38
N.mailing <- 42
Profit.RSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.RSVM <- Profit.RSVM / N.Cust
Profit.RSVM
## [1] 335.6
$335.6
Prof.per.Cust.RSVM
## [1] 0.145913
14.59 cents per customer
Evaluating a polynomial kernel SVM:
tuned.svm.poly <- tune.svm(Choice ~ ., data=BBBC.train, kernel = 'polynomial',
gamma = seq(0.01, 0.1, by = 0.01),
cost = seq(0.1, 1.2, by=0.25), scale=TRUE)
tuned.svm.poly$best.parameters
svm3 <- svm(Choice ~ ., data = BBBC.train, kernel = 'polynomial',
gamma = tuned.svm.poly$best.parameters$gamma,
cost = tuned.svm.poly$best.parameters$cost)
svm.preds.poly <- predict(svm3, BBBC.test)
caret::confusionMatrix(svm.preds.poly, BBBC.test$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2052 174
## 1 44 30
##
## Accuracy : 0.9052
## 95% CI : (0.8925, 0.9169)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.856
##
## Kappa : 0.177
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.14706
## Specificity : 0.97901
## Pos Pred Value : 0.40541
## Neg Pred Value : 0.92183
## Prevalence : 0.08870
## Detection Rate : 0.01304
## Detection Prevalence : 0.03217
## Balanced Accuracy : 0.56303
##
## 'Positive' Class : 1
##
90.52% accuracy.
SVM with poly kernel profit numbers:
N.Cust <- 2300
N.booksold <- 36
N.mailing <- 47
Profit.PSVM <- N.booksold * (Prof.book) - N.mailing * (VC.mail.posting)
Prof.per.Cust.PSVM <- Profit.PSVM / N.Cust
Profit.PSVM
## [1] 313.25
$313.25
Prof.per.Cust.PSVM
## [1] 0.1361957
13.61 cents per customer.