Choice: whether the customer purchased The Art History of Florence (1 = purchase, 0 = nonpurchase)
Gender: 0 = Female, 1 = Male
Amount_purchased: Total money spent on BBBC books
Frequency: Total number of purchases in the chosen period
Last_Purchase: Months since last purchase
First_Purchase: Months since first purchase
P_Child: Number of children’s books purchased
P_Youth: Number of youth books purchased
P_Cook: Number of cookbooks purchased
P_DIY: Number of do-it-yourself books purchased
P_Art: Number of art books purchased
library(readxl)
library(e1071)
library(caret)
library(ROCR)
library(ggplot2)
library(tidyverse)
library(MASS)
library(ggThemeAssist)
library(esquisse)
library(gridExtra)
library(trackdown)
library(corrplot)
source('/Users/thomasfarrell/Downloads/optim_threshold.R')
bbtrain = read_excel("/Users/thomasfarrell/Downloads/BBBC-Train.xlsx")
bbtest = read_excel("/Users/thomasfarrell/Downloads/BBBC-Test.xlsx")
colSums(is.na(bbtest))
## Observation Choice Gender Amount_purchased
## 0 0 0 0
## Frequency Last_purchase First_purchase P_Child
## 0 0 0 0
## P_Youth P_Cook P_DIY P_Art
## 0 0 0 0
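The test set is complete; the same check can be run on the training set.
colSums(is.na(bbtrain))   # same missingness check on the training data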
We remove the first variable (Observation) from both data sets because it is just a row identifier.
bbtrain = bbtrain[-c(1)]
bbtest = bbtest[-c(1)]
bbtrain$Choice = as.factor(bbtrain$Choice)
bbtest$Choice = as.factor(bbtest$Choice)
b1_num = dplyr::select_if(bbtrain, is.numeric)
M = cor(b1_num)
corrplot(M, method = "number")
cor(bbtrain[sapply(bbtrain, is.numeric)])
## Gender Amount_purchased Frequency Last_purchase
## Gender 1.000000000 -0.03060700 0.0321704951 -0.02896341
## Amount_purchased -0.030607000 1.00000000 0.0136664846 0.44070127
## Frequency 0.032170495 0.01366648 1.0000000000 -0.04194328
## Last_purchase -0.028963412 0.44070127 -0.0419432803 1.00000000
## First_purchase 0.001026138 0.37481393 0.4459457457 0.81467469
## P_Child -0.041475936 0.29931372 -0.0433279437 0.67913392
## P_Youth -0.014130306 0.18755727 -0.0095854745 0.45325891
## P_Cook -0.026673876 0.30425340 0.0004968833 0.67250539
## P_DIY -0.025946174 0.22331539 -0.0089634125 0.55816739
## P_Art -0.003500037 0.27248948 -0.0613754066 0.53433415
## First_purchase P_Child P_Youth P_Cook
## Gender 0.001026138 -0.04147594 -0.014130306 -0.0266738763
## Amount_purchased 0.374813928 0.29931372 0.187557270 0.3042533969
## Frequency 0.445945746 -0.04332794 -0.009585474 0.0004968833
## Last_purchase 0.814674687 0.67913392 0.453258910 0.6725053933
## First_purchase 1.000000000 0.54482083 0.367892128 0.5710547918
## P_Child 0.544820825 1.00000000 0.174826719 0.2947065185
## P_Youth 0.367892128 0.17482672 1.000000000 0.1816566401
## P_Cook 0.571054792 0.29470652 0.181656640 1.0000000000
## P_DIY 0.462018843 0.25383708 0.188683456 0.2717251256
## P_Art 0.442082061 0.22451285 0.141751220 0.1916807611
## P_DIY P_Art
## Gender -0.025946174 -0.003500037
## Amount_purchased 0.223315392 0.272489483
## Frequency -0.008963412 -0.061375407
## Last_purchase 0.558167395 0.534334145
## First_purchase 0.462018843 0.442082061
## P_Child 0.253837077 0.224512850
## P_Youth 0.188683456 0.141751220
## P_Cook 0.271725126 0.191680761
## P_DIY 1.000000000 0.207791065
## P_Art 0.207791065 1.000000000
The highest correlation is between First_purchase and Last_purchase, which makes sense: for a customer who has made only one purchase, the months since first purchase and the months since last purchase are the same number, and for repeat customers First_purchase is always at least as large as Last_purchase, so the two measures move together.
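A quick sanity check of this relationship (a sketch, not part of the original output): First_purchase should never be smaller than Last_purchase, and the two should coincide for single-purchase customers.
# Months since first purchase should always be >= months since last purchase
mean(bbtrain$First_purchase >= bbtrain$Last_purchase)
# For customers with a single purchase the two measures should be equal
with(subset(bbtrain, Frequency == 1), mean(First_purchase == Last_purchase))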
p1 = ggplot(data = bbtrain, mapping = aes(x = Frequency, fill = Choice)) +
  geom_histogram() +
  theme(plot.title = element_text(face = "italic"),
        panel.background = element_rect(fill = "gray90", colour = "antiquewhite1", linetype = "dotted"),
        plot.background = element_rect(fill = "white", linetype = "dashed"))
p2 = ggplot(bbtrain) +
aes(x = Last_purchase, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
p3 = ggplot(bbtrain) +
aes(x = P_Child, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
p4 = ggplot(bbtrain) +
aes(x = Amount_purchased, fill = Choice) +
geom_histogram(bins = 30L) +
scale_fill_hue(direction = 1) +
theme_minimal()
grid.arrange(p1,p2,p3,p4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
par(mfrow = c(3,3))
hist(bbtrain$Amount_purchased, xlab = "Amount Purchased", main = "Amount Purchased")
hist(bbtrain$Frequency, xlab = "Total Num. Purchased", main = "Total Purchased")
hist(bbtrain$Last_purchase, xlab = "Months Since Last Purchase", main = "Last Purchase")
hist(bbtrain$First_purchase, xlab = "Months Since First Purchase", main = "First Purchase")
hist(bbtrain$P_Child, xlab = "No. Children's Books Purchased", main = "Children's Books")
hist(bbtrain$P_Youth, xlab = "No. Youth Books Purchased", main = "Youth Books")
hist(bbtrain$P_Cook, xlab = "No. Cook Books Purchased", main = "Cook Books")
hist(bbtrain$P_DIY, xlab = "No. DIY Books Purchased", main = "DIY Books")
hist(bbtrain$P_Art, xlab = "No. Art Books Purchased", main = "Art Books")
From these charts we can see that Amount_purchased is the only variable that looks approximately normally distributed; the other variables are right-skewed.
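To quantify the visual impression, sample skewness can be computed with e1071::skewness (the package is already loaded); positive values indicate right skew.
# Sample skewness of each numeric column (Gender is binary, so its value is not meaningful)
sapply(b1_num, e1071::skewness)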
par(mfrow=c(2,2))
# Linear model on the numeric-coded response (as.numeric(Choice) gives 1/2),
# fit here for diagnostics: residual plots now and VIF below.
mod_1 = lm(as.numeric(Choice) ~ ., data = bbtrain)
plot(mod_1)
summary(mod_1)
##
## Call:
## lm(formula = as.numeric(Choice) ~ ., data = bbtrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3642284 0.0307411 44.378 < 2e-16 ***
## Gender -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
# VIF for the linear model
car::vif(mod_1)
## Gender Amount_purchased Frequency Last_purchase
## 1.005801 1.248066 3.253860 18.770402
## First_purchase P_Child P_Youth P_Cook
## 9.685333 3.360349 1.775022 3.324928
## P_DIY P_Art
## 2.016910 2.273771
Last_purchase and First_purchase have VIF values well above 5 (18.77 and 9.69, respectively), indicating severe multicollinearity, so we remove both variables.
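As a reminder of what VIF measures, the value for a numeric predictor can be reproduced by hand: regress that predictor on all of the others and compute 1/(1 - R^2). A minimal sketch for Last_purchase:
# VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j
# on the remaining predictors; this reproduces car::vif's 18.77 above.
aux = lm(Last_purchase ~ . - Choice, data = bbtrain)
1 / (1 - summary(aux)$r.squared)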
# Drop the two collinear recency variables
bbtrain = dplyr::select(bbtrain, -Last_purchase, -First_purchase)
mod_1 = lm(as.numeric(Choice) ~., data = bbtrain)
car::vif(mod_1)
## Gender Amount_purchased Frequency P_Child
## 1.003526 1.232595 1.007587 1.212223
## P_Youth P_Cook P_DIY P_Art
## 1.083475 1.214043 1.163794 1.138879
Our new model has improved: all remaining variables have VIF below 5, so multicollinearity is no longer a concern.
glm.fit = glm(Choice ~ ., data = bbtrain, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Choice ~ ., family = binomial, data = bbtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.31846 -0.69097 -0.47171 -0.02488 2.84182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.286380 0.202966 -1.411 0.15825
## Gender -0.811948 0.134579 -6.033 1.61e-09 ***
## Amount_purchased 0.002406 0.000771 3.120 0.00181 **
## Frequency -0.088625 0.010385 -8.534 < 2e-16 ***
## P_Child -0.194796 0.072207 -2.698 0.00698 **
## P_Youth -0.031928 0.109605 -0.291 0.77082
## P_Cook -0.292392 0.072998 -4.005 6.19e-05 ***
## P_DIY -0.279282 0.108094 -2.584 0.00977 **
## P_Art 1.245842 0.099062 12.576 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1445.0 on 1591 degrees of freedom
## AIC: 1463
##
## Number of Fisher Scoring iterations: 5
optim_threshold(glm.fit,bbtest, bbtest$Choice)
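The optim_threshold function comes from the locally sourced script, so its internals are not shown here. A hypothetical re-implementation, under the assumption that it sweeps candidate cutoffs and maximizes Youden's J (sensitivity + specificity - 1), might look like this:
# Hypothetical sketch of an optimal-threshold search (not the actual optim_threshold.R)
probs = predict(glm.fit, newdata = bbtest, type = "response")
cuts = seq(0.05, 0.95, by = 0.01)
youden = sapply(cuts, function(t) {
  pred = factor(ifelse(probs >= t, 1, 0), levels = c(0, 1))
  cm = table(pred, bbtest$Choice)
  sens = cm["1", "1"] / sum(cm[, "1"])   # true positive rate
  spec = cm["0", "0"] / sum(cm[, "0"])   # true negative rate
  sens + spec - 1
})
cuts[which.max(youden)]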
predprob = predict.glm(glm.fit, newdata = bbtest, type = "response")
# Classify at the 0.23 cutoff (presumably the value suggested by optim_threshold above);
# renamed from predict.glm to avoid shadowing the stats function of the same name
predclass = ifelse(predprob >= .23, 1, 0)
caret::confusionMatrix(as.factor(predclass), as.factor(bbtest$Choice), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1490 57
## 1 606 147
##
## Accuracy : 0.7117
## 95% CI : (0.6927, 0.7302)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1948
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72059
## Specificity : 0.71088
## Pos Pred Value : 0.19522
## Neg Pred Value : 0.96315
## Prevalence : 0.08870
## Detection Rate : 0.06391
## Detection Prevalence : 0.32739
## Balanced Accuracy : 0.71573
##
## 'Positive' Class : 1
##
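Since ROCR is already loaded, the logistic model's threshold-free performance on the test set can also be summarized with an ROC curve and its AUC.
# ROC curve and AUC for the logistic model on the test set
rocr_pred = ROCR::prediction(predprob, bbtest$Choice)
plot(ROCR::performance(rocr_pred, "tpr", "fpr"))
ROCR::performance(rocr_pred, "auc")@y.values[[1]]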
set.seed(1)
# Tune over the cost grid; note that gamma has no effect on the linear kernel,
# so cost is the only parameter that actually matters here.
tuned = tune.svm(Choice ~ ., data = bbtrain, kernel = 'linear', gamma = seq(.01, .1, by = .025), cost = seq(.1, 1.2, by = .1), scale = TRUE)
tuned$best.parameters
## gamma cost
## 13 0.01 0.4
# Create the SVM using the tuned parameters
svm1 = svm(Choice ~ ., data = bbtrain, kernel = 'linear', gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
# Make predictions on the training data set
predSVM = predict(svm1, bbtrain)
caret::confusionMatrix(predSVM, bbtrain$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1153 286
## 1 47 114
##
## Accuracy : 0.7919
## 95% CI : (0.7711, 0.8115)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 4.577e-05
##
## Kappa : 0.307
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.28500
## Specificity : 0.96083
## Pos Pred Value : 0.70807
## Neg Pred Value : 0.80125
## Prevalence : 0.25000
## Detection Rate : 0.07125
## Detection Prevalence : 0.10063
## Balanced Accuracy : 0.62292
##
## 'Positive' Class : 1
##
On the training data, the tuned model's accuracy is 79.19%, with sensitivity 0.285 and specificity 0.961. The low sensitivity reflects the class imbalance (purchases are only 25% of the training set), so the model rarely predicts a purchase.
# Make predictions on the testing data set
predSVM = predict(svm1, bbtest)
caret::confusionMatrix(predSVM, bbtest$Choice, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2016 147
## 1 80 57
##
## Accuracy : 0.9013
## 95% CI : (0.8884, 0.9132)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9559
##
## Kappa : 0.2832
##
## Mcnemar's Test P-Value : 1.184e-05
##
## Sensitivity : 0.27941
## Specificity : 0.96183
## Pos Pred Value : 0.41606
## Neg Pred Value : 0.93204
## Prevalence : 0.08870
## Detection Rate : 0.02478
## Detection Prevalence : 0.05957
## Balanced Accuracy : 0.62062
##
## 'Positive' Class : 1
##
Against the testing data, the tuned model achieves 90.13% accuracy, with sensitivity 0.279 and specificity 0.962. Note, however, that this accuracy is slightly below the no-information rate of 91.13%, so the headline number largely reflects the class imbalance; the model's practical value lies in correctly flagging about 28% of the actual buyers.