# Load libraries
library(tidyverse)
library(caret)
library(readxl)
library(e1071)
library(ROCR)
library(ggplot2)
library(GGally)
library(corrplot)
library(MASS)
# load in data
train.bb = read_excel("~/Downloads/BBBC-Train.xlsx")
test.bb = read_excel("~/Downloads/BBBC-Test.xlsx")
str(train.bb)
tibble [1,600 × 12] (S3: tbl_df/tbl/data.frame)
$ Observation : num [1:1600] 1 2 3 4 5 6 7 8 9 10 ...
$ Choice : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
$ Gender : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
$ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
$ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
$ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
$ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
$ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
$ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
$ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
$ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
$ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
# missing values?
anyNA(train.bb)
[1] FALSE
anyNA(test.bb)
[1] FALSE
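anyNA() only gives a yes/no answer; if it had returned TRUE, a per-column count is a quick way to locate the gaps. A minimal sketch:
# Per-column NA counts (all zero here)
colSums(is.na(train.bb))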
# Remove the 'Observation' ID column since it is unnecessary.
train.bb = train.bb[-c(1)]
test.bb = test.bb[-c(1)]
str(train.bb)
tibble [1,600 × 11] (S3: tbl_df/tbl/data.frame)
$ Choice : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
$ Gender : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
$ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
$ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
$ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
$ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
$ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
$ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
$ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
$ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
$ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
#Review
table(train.bb$Choice)
0 1
1200 400
table(test.bb$Choice)
0 1
2096 204
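The class proportions make the imbalance explicit: 25% of training customers bought the book versus roughly 8.9% in the test set. A quick check:
# Class proportions in each set
prop.table(table(train.bb$Choice))
prop.table(table(test.bb$Choice))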
# Convert the response 'Choice' and the predictor 'Gender' to factors
train.bb$Choice = as.factor(train.bb$Choice)
test.bb$Choice = as.factor(test.bb$Choice)
train.bb$Gender = as.factor(train.bb$Gender)
test.bb$Gender = as.factor(test.bb$Gender)
EXPLORE DATA
# Data Distribution
train.bb$Last_purchase <- as.numeric(train.bb$Last_purchase) # already numeric per str(); harmless defensive cast
par(mfrow=c(3,3))
hist(train.bb$Amount_purchased, xlab = "Amount Purchased", main = NULL)
hist(train.bb$Frequency, xlab = "Total Num. Purchases in Study", main = NULL)
hist(train.bb$Last_purchase, xlab = "Months Since Last Purchase", main = NULL)
hist(train.bb$First_purchase, xlab = "Months Since First Purchase", main = NULL)
hist(train.bb$P_Child, xlab = "No. Children's Books Purchased", main = NULL)
hist(train.bb$P_Youth, xlab = "No. Youth Books Purchased", main = NULL)
hist(train.bb$P_Cook, xlab = "No. Cook Books Purchased", main = NULL)
hist(train.bb$P_DIY, xlab = "No. DIY Books Purchased", main = NULL)
hist(train.bb$P_Art, xlab = "No. Art Books Purchased", main = NULL)
#Collinearity
corr.bb = subset(train.bb, select = -Choice)
corr.bb = subset(corr.bb, select = -Gender)
corrplot(cor(corr.bb))
## 'First_purchase' and 'Last_purchase' appear strongly positively correlated, suggesting collinearity between them.
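To quantify what the corrplot shows, the pairwise correlation between the two predictors can be computed directly:
# Pairwise correlation between the two collinear predictors
cor(train.bb$First_purchase, train.bb$Last_purchase)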
#Choice by Gender
ggplot(data = train.bb, aes(x = factor(ifelse(Choice == 1, "Purchased Book", "No Purchase")),
                            fill = factor(ifelse(Gender == 0, "Female", "Male")))) +
  geom_bar(alpha = 0.6, color = "grey20") +
  stat_count(geom = "text", colour = "grey20", size = 3.5,
             aes(label = paste("n = ", ..count..)),
             position = position_stack(vjust = 0.5)) +
  labs(title = "# of Customer Orders by Gender",
       x = "", y = "", fill = "Gender")
ANALYSIS
Linear Regression Model:
# Note: as.numeric() on the factor codes Choice as 1/2 rather than 0/1; the residual MSE is unaffected by the shift.
lm.bb <- lm(as.numeric(Choice) ~ ., data = train.bb)
par(mfrow=c(2,2))
plot(lm.bb)
#calculate training (in-sample) MSE
lm.summary <- summary(lm.bb)
mean(lm.summary$residuals^2)
[1] 0.1424887
#Training MSE = 0.1425
#The linear regression model isn't ideal for this classification problem: linear regression assumes a continuous response, so its fitted values are unbounded rather than probabilities in [0, 1].
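For completeness, the linear model's test-set MSE can be computed the same way. A sketch, assuming the same 1/2 factor coding on the test response ('lm.pred' is a hypothetical name):
# Test-set MSE for the linear model (Choice coded 1/2 via as.numeric on the factor)
lm.pred = predict(lm.bb, newdata = test.bb)
mean((as.numeric(test.bb$Choice) - lm.pred)^2)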
Logistic Regression (Logit) Model:
Model 1
#fit model
glm.bb = glm(Choice ~ ., data = train.bb, family = binomial)
summary(glm.bb)
Call:
glm(formula = Choice ~ ., family = binomial, data = train.bb)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.38586 -0.66728 -0.43696 -0.02242 2.72238
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.3515281 0.2143839 -1.640 0.1011
Gender1 -0.8632319 0.1374499 -6.280 3.38e-10 ***
Amount_purchased 0.0018641 0.0007918 2.354 0.0186 *
Frequency -0.0755142 0.0165937 -4.551 5.35e-06 ***
Last_purchase 0.6117713 0.0938127 6.521 6.97e-11 ***
First_purchase -0.0147792 0.0128027 -1.154 0.2483
P_Child -0.8112489 0.1167067 -6.951 3.62e-12 ***
P_Youth -0.6370422 0.1433778 -4.443 8.87e-06 ***
P_Cook -0.9230066 0.1194814 -7.725 1.12e-14 ***
P_DIY -0.9058697 0.1437025 -6.304 2.90e-10 ***
P_Art 0.6861124 0.1270176 5.402 6.60e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1799.5 on 1599 degrees of freedom
Residual deviance: 1392.2 on 1589 degrees of freedom
AIC: 1414.2
Number of Fisher Scoring iterations: 5
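Because logit coefficients are on the log-odds scale, exponentiating them gives odds ratios; for example, the Gender1 estimate of -0.863 corresponds to exp(-0.863) ≈ 0.42, i.e., male customers have roughly 42% of the odds of purchase that female customers have, holding the other predictors fixed. A quick check:
# Odds ratios for the Model 1 coefficients
exp(coef(glm.bb))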
# Model 1 Predictions
# Note: 'optimal' is the sensitivity/specificity cutoff computed in the threshold section below; run that chunk before this one.
glm.prob = predict.glm(glm.bb, newdata = test.bb, type = 'response')
glm.pred = ifelse(glm.prob >= optimal, 1, 0)
caret::confusionMatrix(as.factor(glm.pred), as.factor(test.bb$Choice), positive = '1')
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1536 57
1 560 147
Accuracy : 0.7317
95% CI : (0.7131, 0.7498)
No Information Rate : 0.9113
P-Value [Acc > NIR] : 1
Kappa : 0.2146
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.72059
Specificity : 0.73282
Pos Pred Value : 0.20792
Neg Pred Value : 0.96422
Prevalence : 0.08870
Detection Rate : 0.06391
Detection Prevalence : 0.30739
Balanced Accuracy : 0.72671
'Positive' Class : 1
Model 2
#Drop 'First_purchase' (p = 0.25, not significant) and the only weakly significant 'Amount_purchased' (p = 0.019) to simplify the model
glm.bb2= glm(Choice ~ . -First_purchase -Amount_purchased, data = train.bb, family = binomial)
summary(glm.bb2)
Call:
glm(formula = Choice ~ . - First_purchase - Amount_purchased,
family = binomial, data = train.bb)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.49591 -0.66246 -0.44140 -0.01451 2.79621
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.00390 0.16493 0.024 0.981
Gender1 -0.86499 0.13703 -6.312 2.75e-10 ***
Frequency -0.08974 0.01064 -8.436 < 2e-16 ***
Last_purchase 0.57397 0.07822 7.338 2.16e-13 ***
P_Child -0.81070 0.11618 -6.978 3.00e-12 ***
P_Youth -0.64318 0.14338 -4.486 7.27e-06 ***
P_Cook -0.92741 0.11890 -7.800 6.19e-15 ***
P_DIY -0.90981 0.14307 -6.359 2.03e-10 ***
P_Art 0.67881 0.12538 5.414 6.17e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1799.5 on 1599 degrees of freedom
Residual deviance: 1398.9 on 1591 degrees of freedom
AIC: 1416.9
Number of Fisher Scoring iterations: 5
#Model 2 Predictions (again thresholded at the 'optimal' cutoff computed below)
glm.prob2 = predict.glm(glm.bb2, newdata = test.bb, type = 'response')
glm.pred2 = ifelse(glm.prob2 >= optimal, 1, 0)
caret::confusionMatrix(as.factor(glm.pred2), as.factor(test.bb$Choice), positive = "1")
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1529 59
1 567 145
Accuracy : 0.7278
95% CI : (0.7091, 0.7459)
No Information Rate : 0.9113
P-Value [Acc > NIR] : 1
Kappa : 0.2073
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.71078
Specificity : 0.72948
Pos Pred Value : 0.20365
Neg Pred Value : 0.96285
Prevalence : 0.08870
Detection Rate : 0.06304
Detection Prevalence : 0.30957
Balanced Accuracy : 0.72013
'Positive' Class : 1
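A threshold-free way to compare the two logit models is their test-set AUC, which ROCR can compute from the predicted probabilities. A minimal sketch using the Model 2 probabilities ('pred.test' is a hypothetical name):
# Test-set AUC for Model 2
pred.test = prediction(glm.prob2, test.bb$Choice)
performance(pred.test, "auc")@y.values[[1]]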
Finding Optimal Threshold: Sensitivity vs. Specificity Plot
pred.plot = prediction(predict(glm.bb2, train.bb, type = 'response'), train.bb$Choice)
sens = performance(pred.plot, "sens")
spec = performance(pred.plot, "spec")
plot(unlist(sens@x.values), unlist(sens@y.values), type = "l", lwd = 2,
     ylab = "Sensitivity", xlab = "Cutoff", main = "Sensitivity vs. Specificity")
par(new = TRUE)
plot(unlist(spec@x.values), unlist(spec@y.values), type = "l", lwd = 2, col = 'blue',
     ylab = "", xlab = "")
axis(4, at = seq(0, 1, 0.2))
mtext("Specificity", side = 4, col = 'blue')
# Find the cutoff where the sensitivity and specificity curves intersect
min.diff = which.min(abs(unlist(sens@y.values) - unlist(spec@y.values)))
min.x = unlist(sens@x.values)[min.diff]
min.y = unlist(spec@y.values)[min.diff]
optimal = min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x, 0, paste("Optimal Threshold = ", round(optimal, 2)), pos = 4)
Support Vector Machine (SVM) Model:
#tuning: grid search over gamma and cost (10-fold cross-validation by default)
set.seed(10)
tune.bb = tune.svm(Choice ~ . -First_purchase -Amount_purchased, data = train.bb,
gamma = seq(.01, 0.1, by = .01), cost = seq(0.1, 1, by = .1), scale = TRUE)
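Before refitting, the tuning results can be inspected directly; tune objects store the winning grid point and its cross-validation error:
# Best hyperparameters and their cross-validation error
tune.bb$best.parameters
tune.bb$best.performance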
#fit model:
svm.bb = svm(Choice ~ . -First_purchase -Amount_purchased, data = train.bb,
gamma = tune.bb$best.parameters$gamma, cost = tune.bb$best.parameters$cost, scale = TRUE)
summary(svm.bb)
Call:
svm(formula = Choice ~ . - First_purchase - Amount_purchased, data = train.bb, gamma = tune.bb$best.parameters$gamma,
cost = tune.bb$best.parameters$cost, scale = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 0.9
Number of Support Vectors: 795
( 364 431 )
Number of Classes: 2
Levels:
0 1
#make predictions:
svm.pred = predict(svm.bb, test.bb) # predict.svm returns class labels for classification; no 'type' argument is needed
caret::confusionMatrix(as.factor(svm.pred), as.factor(test.bb$Choice), positive = '1')
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2035 155
1 61 49
Accuracy : 0.9061
95% CI : (0.8934, 0.9177)
No Information Rate : 0.9113
P-Value [Acc > NIR] : 0.8208
Kappa : 0.2665
Mcnemar's Test P-Value : 2.486e-10
Sensitivity : 0.24020
Specificity : 0.97090
Pos Pred Value : 0.44545
Neg Pred Value : 0.92922
Prevalence : 0.08870
Detection Rate : 0.02130
Detection Prevalence : 0.04783
Balanced Accuracy : 0.60555
'Positive' Class : 1
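The low sensitivity of 0.24 reflects the 3:1 class imbalance in the training data; one option is e1071's class.weights argument, which upweights the minority class during fitting. A sketch (the 3:1 weight simply mirrors the training imbalance and is a starting point, not a tuned value; 'svm.w' is a hypothetical name):
# Refit the radial SVM with class weights to counter the 3:1 imbalance
svm.w = svm(Choice ~ . -First_purchase -Amount_purchased, data = train.bb,
            gamma = tune.bb$best.parameters$gamma, cost = tune.bb$best.parameters$cost,
            class.weights = c("0" = 1, "1" = 3), scale = TRUE)
# predict(svm.w, test.bb) would then be evaluated exactly as above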
Linear SVM Model:
#tuning: grid search over cost for a linear kernel
tune.linear = tune(svm, Choice ~ .-First_purchase -Amount_purchased, data = train.bb, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.05, 0.1, 1, 5, 10, 100)))
#extract the best model:
svm.linear = tune.linear$best.model
#make predictions:
lin.pred = predict(svm.linear, test.bb) # class labels by default
caret::confusionMatrix(as.factor(lin.pred), as.factor(test.bb$Choice), positive='1')
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2018 152
1 78 52
Accuracy : 0.9
95% CI : (0.887, 0.912)
No Information Rate : 0.9113
P-Value [Acc > NIR] : 0.9724
Kappa : 0.2603
Mcnemar's Test P-Value : 1.483e-06
Sensitivity : 0.25490
Specificity : 0.96279
Pos Pred Value : 0.40000
Neg Pred Value : 0.92995
Prevalence : 0.08870
Detection Rate : 0.02261
Detection Prevalence : 0.05652
Balanced Accuracy : 0.60884
'Positive' Class : 1
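Collecting the test-set metrics reported above into one table makes the trade-off explicit: the SVMs win on raw accuracy, but the logit models identify far more of the actual buyers. A summary sketch using the values from the confusion matrices above:
# Side-by-side test-set results (values copied from the confusion matrices above)
data.frame(Model = c("Logit 1", "Logit 2", "Radial SVM", "Linear SVM"),
           Accuracy = c(0.7317, 0.7278, 0.9061, 0.9000),
           Sensitivity = c(0.7206, 0.7108, 0.2402, 0.2549))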