library(readxl)
library(knitr)
library(ggplot2)
library(gridExtra) #par(mfrow = c()) doesn't work with ggplot
library(stats)
library(car)
## Loading required package: carData
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
bbtrain <- read_excel('/Users/ponce/Desktop/DA-6813/Case Study 2/BBBC-Train.xlsx')
bbtest <- read_excel('/Users/ponce/Desktop/DA-6813/Case Study 2/BBBC-Test.xlsx')
str(bbtrain)
## tibble [1,600 Ă— 12] (S3: tbl_df/tbl/data.frame)
## $ Observation : num [1:1600] 1 2 3 4 5 6 7 8 9 10 ...
## $ Choice : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
## $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
## $ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
## $ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
## $ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
## $ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
## $ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
## $ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
## $ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
## $ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
Choice: Categorical. 1 = the customer purchased the book "The Art History of Florence", 0 = the customer did not purchase the book.
Gender: Categorical. Female = 0, Male = 1.
Amount_purchased: Numerical. Total money spent on BBBC books (how much the customer has spent with us over time).
Frequency: Numerical. Total number of purchases in the chosen period.
Last_purchase: Numerical. Months since last purchase.
First_purchase: Numerical. Months since first purchase.
P_Child: Numerical. Number of children's books purchased.
P_Youth: Numerical. Number of youth books purchased.
P_Cook: Numerical. Number of cookbooks purchased.
P_DIY: Numerical. Number of do-it-yourself books purchased.
P_Art: Numerical. Number of art books purchased.
Check for missing values
anyNA(bbtrain)
## [1] FALSE
anyNA(bbtest)
## [1] FALSE
No missing values in either dataset.
#Train data categorical variables
bbtrain$Choice <- as.factor(bbtrain$Choice)
bbtrain$Gender <- as.factor(bbtrain$Gender)
#Test data categorical variables
bbtest$Choice <- as.factor(bbtest$Choice)
bbtest$Gender <- as.factor(bbtest$Gender)
#Drop observation variable from both datasets
bbtrain <- subset(bbtrain, select = -c(Observation))
bbtest <- subset(bbtest, select = -c(Observation))
knitr::kable(summary(bbtrain))
| Choice | Gender | Amount_purchased | Frequency | Last_purchase | First_purchase | P_Child | P_Youth | P_Cook | P_DIY | P_Art | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0:1200 | 0: 546 | Min. : 15.0 | Min. : 2.00 | Min. : 1.000 | Min. : 2.00 | Min. :0.0000 | Min. :0.0000 | Min. :0.00 | Min. :0.0000 | Min. :0.000 | |
| 1: 400 | 1:1054 | 1st Qu.:126.8 | 1st Qu.: 6.00 | 1st Qu.: 1.000 | 1st Qu.:12.00 | 1st Qu.:0.0000 | 1st Qu.:0.0000 | 1st Qu.:0.00 | 1st Qu.:0.0000 | 1st Qu.:0.000 | |
| NA | NA | Median :203.0 | Median :12.00 | Median : 2.000 | Median :18.00 | Median :0.0000 | Median :0.0000 | Median :0.00 | Median :0.0000 | Median :0.000 | |
| NA | NA | Mean :200.9 | Mean :12.31 | Mean : 3.199 | Mean :22.58 | Mean :0.7394 | Mean :0.3375 | Mean :0.76 | Mean :0.3912 | Mean :0.425 | |
| NA | NA | 3rd Qu.:273.0 | 3rd Qu.:16.00 | 3rd Qu.: 4.000 | 3rd Qu.:30.00 | 3rd Qu.:1.0000 | 3rd Qu.:1.0000 | 3rd Qu.:1.00 | 3rd Qu.:1.0000 | 3rd Qu.:1.000 | |
| NA | NA | Max. :474.0 | Max. :36.00 | Max. :12.000 | Max. :96.00 | Max. :8.0000 | Max. :4.0000 | Max. :6.00 | Max. :4.0000 | Max. :5.000 |
knitr::kable(summary(bbtest))
| Choice | Gender | Amount_purchased | Frequency | Last_purchase | First_purchase | P_Child | P_Youth | P_Cook | P_DIY | P_Art | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0:2096 | 0: 721 | Min. : 15.0 | Min. : 2.0 | Min. : 1.000 | Min. : 2.00 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.00 | |
| 1: 204 | 1:1579 | 1st Qu.:119.0 | 1st Qu.: 8.0 | 1st Qu.: 1.000 | 1st Qu.:12.00 | 1st Qu.:0.0000 | 1st Qu.:0.0000 | 1st Qu.:0.0000 | 1st Qu.:0.0000 | 1st Qu.:0.00 | |
| NA | NA | Median :198.0 | Median :12.0 | Median : 2.000 | Median :18.00 | Median :0.0000 | Median :0.0000 | Median :0.0000 | Median :0.0000 | Median :0.00 | |
| NA | NA | Mean :195.3 | Mean :13.3 | Mean : 3.059 | Mean :22.85 | Mean :0.7287 | Mean :0.3426 | Mean :0.7857 | Mean :0.4061 | Mean :0.33 | |
| NA | NA | 3rd Qu.:268.0 | 3rd Qu.:16.0 | 3rd Qu.: 4.000 | 3rd Qu.:32.00 | 3rd Qu.:1.0000 | 3rd Qu.:1.0000 | 3rd Qu.:1.0000 | 3rd Qu.:1.0000 | 3rd Qu.:1.00 | |
| NA | NA | Max. :461.0 | Max. :36.0 | Max. :12.000 | Max. :96.00 | Max. :7.0000 | Max. :5.0000 | Max. :6.0000 | Max. :4.0000 | Max. :4.00 |
Differences between the Test and Train datasets:
Gender has a similar distribution in both datasets.
Child, Youth, Cook, DIY, and Art purchase counts have similar means in both datasets.
The Test set has more observations (2,300) than the Train set (1,600).
The Test set has a much lower proportion of Yes responses, as the quick check below shows.
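To quantify the imbalance, a quick check of the class proportions in each set (a minimal sketch using base R):
#Proportion of Choice = 0/1 in each dataset
prop.table(table(bbtrain$Choice)) #Train: 400/1600 = 25% Yes
prop.table(table(bbtest$Choice)) #Test: 204/2300 = ~8.9% Yes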
#Plot for Train Dataset
p1 <- ggplot(bbtrain, aes(x = Choice)) +
geom_bar(fill = 'blue') +
labs(title = 'Frequency of Yes/No in Train Set',
x = 'Choice',
y = 'Frequency') +
theme_minimal()
#Plot for Test Dataset
p2 <- ggplot(bbtest, aes(x = Choice)) +
geom_bar(fill = 'red') +
labs(title = 'Frequency of Yes/No in Test Set',
x = 'Choice',
y = 'Frequency') +
theme_minimal()
grid.arrange(p1, p2, nrow = 1)
Even though our Test dataset is larger, it contains fewer Yes responses.
knitr::kable(cor(bbtrain[, sapply(bbtrain, is.numeric)]))
| Amount_purchased | Frequency | Last_purchase | First_purchase | P_Child | P_Youth | P_Cook | P_DIY | P_Art | |
|---|---|---|---|---|---|---|---|---|---|
| Amount_purchased | 1.0000000 | 0.0136665 | 0.4407013 | 0.3748139 | 0.2993137 | 0.1875573 | 0.3042534 | 0.2233154 | 0.2724895 |
| Frequency | 0.0136665 | 1.0000000 | -0.0419433 | 0.4459457 | -0.0433279 | -0.0095855 | 0.0004969 | -0.0089634 | -0.0613754 |
| Last_purchase | 0.4407013 | -0.0419433 | 1.0000000 | 0.8146747 | 0.6791339 | 0.4532589 | 0.6725054 | 0.5581674 | 0.5343341 |
| First_purchase | 0.3748139 | 0.4459457 | 0.8146747 | 1.0000000 | 0.5448208 | 0.3678921 | 0.5710548 | 0.4620188 | 0.4420821 |
| P_Child | 0.2993137 | -0.0433279 | 0.6791339 | 0.5448208 | 1.0000000 | 0.1748267 | 0.2947065 | 0.2538371 | 0.2245128 |
| P_Youth | 0.1875573 | -0.0095855 | 0.4532589 | 0.3678921 | 0.1748267 | 1.0000000 | 0.1816566 | 0.1886835 | 0.1417512 |
| P_Cook | 0.3042534 | 0.0004969 | 0.6725054 | 0.5710548 | 0.2947065 | 0.1816566 | 1.0000000 | 0.2717251 | 0.1916808 |
| P_DIY | 0.2233154 | -0.0089634 | 0.5581674 | 0.4620188 | 0.2538371 | 0.1886835 | 0.2717251 | 1.0000000 | 0.2077911 |
| P_Art | 0.2724895 | -0.0613754 | 0.5343341 | 0.4420821 | 0.2245128 | 0.1417512 | 0.1916808 | 0.2077911 | 1.0000000 |
Last_purchase and First_purchase have a strong positive correlation of 0.8147, perhaps suggesting that customers who purchased recently also tend to have a long purchase history.
Last_purchase and P_Child have a moderately strong positive correlation of 0.6791, perhaps suggesting that recent buyers are somewhat more likely to have bought children's books.
First_purchase and P_Child have a moderate positive correlation of 0.5448, perhaps suggesting that customers with children tend to have been customers for longer.
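To pull the strongest pairs out of the correlation matrix programmatically instead of scanning the table, a small sketch (the 0.6 cutoff is an arbitrary choice):
#List variable pairs with |correlation| > 0.6
cor_mat <- cor(bbtrain[, sapply(bbtrain, is.numeric)])
cor_mat[upper.tri(cor_mat, diag = TRUE)] <- NA #keep each pair only once
high_cor <- which(abs(cor_mat) > 0.6, arr.ind = TRUE)
data.frame(var1 = rownames(cor_mat)[high_cor[, 1]],
           var2 = colnames(cor_mat)[high_cor[, 2]],
           r = cor_mat[high_cor])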
par(mfrow = c(1, 3))
boxplot(bbtrain$Amount_purchased,
main = 'Amount_purchased')
boxplot(bbtrain$Frequency,
main = 'Frequency')
boxplot(bbtrain$Last_purchase,
main = 'Last_purchase')
par(mfrow = c(1, 1))
Amount_purchased looks roughly symmetric, with at most a slight skew.
Frequency has a slight positive skew with outliers on the right side.
Last_purchase has a positive skew with a few outliers.
par(mfrow = c(1, 3))
boxplot(bbtrain$First_purchase,
main = 'First_purchase')
boxplot(bbtrain$P_Child,
main = 'P_Child')
boxplot(bbtrain$P_Youth,
main = 'P_Youth')
par(mfrow = c(1, 1))
First_purchase looks to be positively skewed with a lot
of outliers.
par(mfrow = c(1, 3))
boxplot(bbtrain$P_Cook,
main = 'P_Cook')
boxplot(bbtrain$P_DIY,
main = 'P_DIY')
boxplot(bbtrain$P_Art,
main = 'P_Art')
par(mfrow = c(1, 1))
All of the P_ (book-type) counts show the same positive skew with a few outliers on the right side. This is likely because most customers did not buy in a given category, so the median of each count is 0.
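That reading can be checked by comparing medians between buyers and non-buyers; a quick sketch for one of the count variables (P_Art as an example):
#Median number of art books purchased, split by Choice
aggregate(P_Art ~ Choice, data = bbtrain, FUN = median)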
#For Linear Regression we convert our response variable back to numeric (note: as.numeric() on a factor returns the level codes 1/2, not the original 0/1)
full_linear_model <- lm(as.numeric(Choice) ~ ., data = bbtrain)
summary(full_linear_model)
##
## Call:
## lm(formula = as.numeric(Choice) ~ ., data = bbtrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3642284 0.0307411 44.378 < 2e-16 ***
## Gender1 -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
stepwise_linear_model <- step(full_linear_model, direction = 'both', trace = FALSE)
summary(stepwise_linear_model)
##
## Call:
## lm(formula = as.numeric(Choice) ~ Gender + Amount_purchased +
## Frequency + Last_purchase + P_Child + P_Youth + P_Cook +
## P_DIY + P_Art, data = bbtrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9802 -0.2452 -0.1157 0.1655 1.0595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3727367 0.0297590 46.128 < 2e-16 ***
## Gender1 -0.1316464 0.0200208 -6.575 6.56e-11 ***
## Amount_purchased 0.0002742 0.0001110 2.470 0.0136 *
## Frequency -0.0110830 0.0012128 -9.138 < 2e-16 ***
## Last_purchase 0.0894288 0.0116772 7.658 3.25e-14 ***
## P_Child -0.1275991 0.0163571 -7.801 1.11e-14 ***
## P_Youth -0.0973642 0.0200903 -4.846 1.38e-06 ***
## P_Cook -0.1433497 0.0165218 -8.676 < 2e-16 ***
## P_DIY -0.1365578 0.0197520 -6.914 6.82e-12 ***
## P_Art 0.1150034 0.0192719 5.967 2.97e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared: 0.2395, Adjusted R-squared: 0.2352
## F-statistic: 55.63 on 9 and 1590 DF, p-value: < 2.2e-16
vif(stepwise_linear_model)
## Gender Amount_purchased Frequency Last_purchase
## 1.004715 1.248033 1.007855 13.920175
## P_Child P_Youth P_Cook P_DIY
## 3.341880 1.771355 3.290657 2.009454
## P_Art
## 2.233698
Last_purchase has a VIF higher than 5 so we remove
it.
final_linear_model <- lm(as.numeric(Choice) ~ . - First_purchase - Last_purchase,
data = bbtrain)
summary(final_linear_model)
##
## Call:
## lm(formula = as.numeric(Choice) ~ . - First_purchase - Last_purchase,
## data = bbtrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9501 -0.2518 -0.1273 0.1509 1.1211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3731865 0.0302933 45.330 < 2e-16 ***
## Gender1 -0.1263728 0.0203683 -6.204 6.99e-10 ***
## Amount_purchased 0.0003688 0.0001123 3.283 0.00105 **
## Frequency -0.0112345 0.0012344 -9.101 < 2e-16 ***
## P_Child -0.0275983 0.0100284 -2.752 0.00599 **
## P_Youth -0.0014841 0.0159946 -0.093 0.92609
## P_Cook -0.0428346 0.0102155 -4.193 2.90e-05 ***
## P_DIY -0.0384262 0.0153017 -2.511 0.01213 *
## P_Art 0.2183323 0.0140081 15.586 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3856 on 1591 degrees of freedom
## Multiple R-squared: 0.2114, Adjusted R-squared: 0.2075
## F-statistic: 53.32 on 8 and 1591 DF, p-value: < 2.2e-16
P_Youth becomes statistically insignificant, but we keep it in the model because the p-value is not the only measure of a predictor's usefulness.
par(mfrow = c(2, 2))
plot(final_linear_model)
par(mfrow = c(1, 1))
Our diagnostic plots are concerning. The Residuals vs Fitted plot suggests the model missed a non-linear relationship, which is left in the residuals.
The Q-Q plot suggests non-normal residuals, which may mean the data needs transforming.
The Scale-Location plot suggests the residuals are not spread equally across the fitted values, so the equal-variance assumption may be violated.
The Residuals vs Leverage plot shows a few potential high-leverage points.
pred_probs <- predict(final_linear_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
pred_class <- factor(pred_class)
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Warning in confusionMatrix.default(pred_class, bbtest$Choice, positive = "1"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 2096 204
##
## Accuracy : 0.0887
## 95% CI : (0.0774, 0.1011)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.0887
## Neg Pred Value : NaN
## Prevalence : 0.0887
## Detection Rate : 0.0887
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
Our performance metrics indicate very poor model performance, which is what we expected.
We knew something would go wrong because linear regression is intended for a continuous response, while Choice is categorical (0/1). On top of that, the model was fit on as.numeric(Choice), which maps the factor levels to 1 and 2 rather than 0 and 1, as the quick check below shows.
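A quick check of the coding issue (sketch): once Choice is a factor, as.numeric() returns the level codes 1 and 2, not the original 0/1 labels.
#Factor level codes, not 0/1
table(as.numeric(bbtrain$Choice)) #values are 1 and 2
#A true 0/1 recoding would be
table(as.numeric(as.character(bbtrain$Choice)))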
proc_linear <- roc(bbtest$Choice, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc_linear)
coords(proc_linear, 'best', ret = 'threshold')
## threshold
## 1 1.239856
pred_probs <- predict(final_linear_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 1.239856, 1, 0)
pred_class <- factor(pred_class)
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1368 43
## 1 728 161
##
## Accuracy : 0.6648
## 95% CI : (0.6451, 0.6841)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1757
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7892
## Specificity : 0.6527
## Pos Pred Value : 0.1811
## Neg Pred Value : 0.9695
## Prevalence : 0.0887
## Detection Rate : 0.0700
## Detection Prevalence : 0.3865
## Balanced Accuracy : 0.7209
##
## 'Positive' Class : 1
##
Using the ROC curve to find the optimal threshold gives 1.24, a value above 1. That would be impossible for genuine probabilities, but these predictions are not probabilities: they are fitted values from a linear model whose response was coded as 1/2, so they naturally land in roughly the 1-to-2 range.
full_logistic_model <- glm(Choice ~ ., data = bbtrain, family = 'binomial')
summary(full_logistic_model)
##
## Call:
## glm(formula = Choice ~ ., family = "binomial", data = bbtrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3515281 0.2143839 -1.640 0.1011
## Gender1 -0.8632319 0.1374499 -6.280 3.38e-10 ***
## Amount_purchased 0.0018641 0.0007918 2.354 0.0186 *
## Frequency -0.0755142 0.0165937 -4.551 5.35e-06 ***
## Last_purchase 0.6117713 0.0938127 6.521 6.97e-11 ***
## First_purchase -0.0147792 0.0128027 -1.154 0.2483
## P_Child -0.8112489 0.1167067 -6.951 3.62e-12 ***
## P_Youth -0.6370422 0.1433778 -4.443 8.87e-06 ***
## P_Cook -0.9230066 0.1194814 -7.725 1.12e-14 ***
## P_DIY -0.9058697 0.1437025 -6.304 2.90e-10 ***
## P_Art 0.6861124 0.1270176 5.402 6.60e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1392.2 on 1589 degrees of freedom
## AIC: 1414.2
##
## Number of Fisher Scoring iterations: 5
Multicollinearity Check
vif(full_logistic_model)
## Gender Amount_purchased Frequency Last_purchase
## 1.023359 1.232172 2.490447 17.706670
## First_purchase P_Child P_Youth P_Cook
## 9.247748 2.992269 1.761546 3.229097
## P_DIY P_Art
## 1.992698 1.938089
Last_purchase and First_purchase both have VIFs higher than 5.
stepwise_logistic_model <- step(full_logistic_model, direction = 'both', trace = FALSE)
summary(stepwise_logistic_model)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## family = "binomial", data = bbtrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2833949 0.2062721 -1.374 0.1695
## Gender1 -0.8660575 0.1373268 -6.307 2.85e-10 ***
## Amount_purchased 0.0018357 0.0007908 2.321 0.0203 *
## Frequency -0.0903261 0.0106304 -8.497 < 2e-16 ***
## Last_purchase 0.5536689 0.0784519 7.057 1.70e-12 ***
## P_Child -0.8181807 0.1163377 -7.033 2.02e-12 ***
## P_Youth -0.6424923 0.1432548 -4.485 7.29e-06 ***
## P_Cook -0.9330131 0.1190073 -7.840 4.51e-15 ***
## P_DIY -0.9101106 0.1433591 -6.348 2.17e-10 ***
## P_Art 0.6643371 0.1255243 5.292 1.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1393.5 on 1590 degrees of freedom
## AIC: 1413.5
##
## Number of Fisher Scoring iterations: 5
vif(stepwise_logistic_model)
## Gender Amount_purchased Frequency Last_purchase
## 1.022844 1.233123 1.019533 12.384177
## P_Child P_Youth P_Cook P_DIY
## 2.976569 1.750190 3.200878 1.995789
## P_Art
## 1.883823
Last_purchase still has a VIF higher than 5 so we drop
it.
final_logistic_model <- glm(Choice ~ . - Last_purchase - First_purchase,
data = bbtrain, family = 'binomial')
summary(final_logistic_model)
##
## Call:
## glm(formula = Choice ~ . - Last_purchase - First_purchase, family = "binomial",
## data = bbtrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.286380 0.202966 -1.411 0.15825
## Gender1 -0.811948 0.134579 -6.033 1.61e-09 ***
## Amount_purchased 0.002406 0.000771 3.120 0.00181 **
## Frequency -0.088625 0.010385 -8.534 < 2e-16 ***
## P_Child -0.194796 0.072207 -2.698 0.00698 **
## P_Youth -0.031928 0.109605 -0.291 0.77082
## P_Cook -0.292392 0.072998 -4.005 6.19e-05 ***
## P_DIY -0.279282 0.108094 -2.584 0.00977 **
## P_Art 1.245842 0.099062 12.576 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1445.0 on 1591 degrees of freedom
## AIC: 1463
##
## Number of Fisher Scoring iterations: 5
vif(final_logistic_model)
## Gender Amount_purchased Frequency P_Child
## 1.020217 1.213528 1.015899 1.215500
## P_Youth P_Cook P_DIY P_Art
## 1.081019 1.228798 1.179821 1.229491
pred_probs <- predict(final_logistic_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1984 132
## 1 112 72
##
## Accuracy : 0.8939
## 95% CI : (0.8806, 0.9062)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9981
##
## Kappa : 0.3134
##
## Mcnemar's Test P-Value : 0.2239
##
## Sensitivity : 0.3529
## Specificity : 0.9466
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.9376
## Prevalence : 0.0887
## Detection Rate : 0.0313
## Detection Prevalence : 0.0800
## Balanced Accuracy : 0.6498
##
## 'Positive' Class : 1
##
High accuracy but low sensitivity, driven by the class imbalance. Next we test the same model with a different threshold to see how much the sensitivity improves.
proc_logistic <- roc(bbtest$Choice, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc_logistic)
coords(proc_logistic, "best", ret = "threshold")
## threshold
## 1 0.2053029
The optimal threshold for our final model on this unbalanced dataset is 0.2053029.
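By default, coords(..., 'best') in pROC picks the threshold that maximizes Youden's J (sensitivity + specificity - 1); a minimal sketch reproducing that choice by hand (assuming a recent pROC version where coords() returns a data frame):
#Reproduce the 'best' threshold by maximizing Youden's J
roc_df <- coords(proc_logistic, 'all', ret = c('threshold', 'sensitivity', 'specificity'))
roc_df[which.max(roc_df$sensitivity + roc_df$specificity - 1), ]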
pred_probs <- predict(final_logistic_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.2053029, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1378 43
## 1 718 161
##
## Accuracy : 0.6691
## 95% CI : (0.6495, 0.6884)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1791
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7892
## Specificity : 0.6574
## Pos Pred Value : 0.1832
## Neg Pred Value : 0.9697
## Prevalence : 0.0887
## Detection Rate : 0.0700
## Detection Prevalence : 0.3822
## Balanced Accuracy : 0.7233
##
## 'Positive' Class : 1
##
#Only get the probabilities of training data
train_pred_probs <- predict(final_logistic_model, type = 'response') #we don't include 'newdata = bbtest' so it only gets probabilities of bbtrain.
#Select only numeric predictors
numerical_data <- bbtrain[, sapply(bbtrain, is.numeric)]
predictors <- colnames(bbtrain[, sapply(bbtrain, is.numeric)])
#Bind the logit and tidying the data for plot
numerical_data <- numerical_data |>
mutate(logit = log(train_pred_probs / (1 - train_pred_probs))) |>
gather(key = 'predictors', value = 'predictor.value', -logit)
#Create scatter plots
ggplot(numerical_data, aes(logit, predictor.value)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = 'loess') +
theme_bw() +
facet_wrap(~predictors, scales = 'free_y')
## `geom_smooth()` using formula = 'y ~ x'
These plots suggest applying transformations to First_purchase, Last_purchase, and possibly Amount_purchased, P_Art, Frequency, and P_Child (see the sketch below):
Polynomial (squared) terms: First_purchase, Last_purchase, Frequency, Amount_purchased.
Log terms: all of the P_ book-count variables.
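A sketch of that plan as explicit columns (assuming squared terms for the timing/spend variables and log1p for the counts; the models below apply the same idea directly in the formula):
#Build transformed predictors explicitly (sketch)
bbtrain_tf <- bbtrain |>
  mutate(Amount_purchased_sq = Amount_purchased^2,
         Frequency_sq = Frequency^2,
         First_purchase_sq = First_purchase^2,
         Last_purchase_sq = Last_purchase^2,
         across(starts_with('P_'), log1p, .names = '{.col}_log'))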
plot(final_logistic_model, which = 4)
#Extract model results
model.data <- augment(final_logistic_model) |>
mutate(index = 1:n())
#Data for the top 2 largest values according to Cook's distance
model.data |>
top_n(2, .cooksd)
## # A tibble: 2 Ă— 18
## Choice Gender Amount_purchased Frequency Last_purchase First_purchase P_Child
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 273 30 9 68 1
## 2 0 1 267 14 12 72 5
## # ℹ 11 more variables: P_Youth <dbl>, P_Cook <dbl>, P_DIY <dbl>, P_Art <dbl>,
## # .fitted <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>, index <int>
#Plot standardized residuals
ggplot(model.data, aes(index, .std.resid)) +
geom_point(aes(color = Choice), alpha = .5) +
theme_bw()
model.data |>
filter(abs(.std.resid) > 3)
## # A tibble: 0 Ă— 18
## # ℹ 18 variables: Choice <fct>, Gender <fct>, Amount_purchased <dbl>,
## # Frequency <dbl>, Last_purchase <dbl>, First_purchase <dbl>, P_Child <dbl>,
## # P_Youth <dbl>, P_Cook <dbl>, P_DIY <dbl>, P_Art <dbl>, .fitted <dbl>,
## # .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>,
## # index <int>
No influential observations in our data for this model.
hist(bbtrain$Frequency, breaks = 'Scott')
Frequency shows a clearly right-skewed distribution, so this is a variable we could apply a transformation to.
hist(bbtrain$Amount_purchased, breaks = 40)
Amount_purchased is roughly symmetric (skewness 0.084), while Frequency (skewness 1.037) shows a moderate positive skew.
Let’s just look at the skewness of all our data:
knitr::kable(apply(bbtrain[sapply(bbtrain, is.numeric)], 2, skewness, na.rm = TRUE))
| x | |
|---|---|
| Amount_purchased | 0.0849411 |
| Frequency | 1.0377839 |
| Last_purchase | 1.5221362 |
| First_purchase | 1.2872061 |
| P_Child | 1.9174050 |
| P_Youth | 2.1768945 |
| P_Cook | 1.6472399 |
| P_DIY | 1.8948099 |
| P_Art | 2.0095352 |
Skewness interpretation:
If skewness is near 0, the variable is approximately symmetric.
If skewness is greater than 1 or less than -1, the variable is highly skewed.
Almost all of our variables are highly skewed to the right, with skewness values above 1.
Given this, we should apply transformations to the highly skewed variables.
final_logistic_model2 <- glm(Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) +
log(P_DIY + 1) + log(P_Art + 1) +
I(First_purchase^2),
data = bbtrain, family = 'binomial')
summary(final_logistic_model2)
##
## Call:
## glm(formula = Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
## log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) + log(P_DIY +
## 1) + log(P_Art + 1) + I(First_purchase^2), family = "binomial",
## data = bbtrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.201e-01 1.636e-01 -3.179 0.00148 **
## Gender1 -8.041e-01 1.337e-01 -6.016 1.79e-09 ***
## I(Amount_purchased^2) 5.497e-06 1.787e-06 3.077 0.00209 **
## I(Frequency^2) -2.983e-03 4.353e-04 -6.853 7.25e-12 ***
## log(P_Child + 1) -4.257e-01 1.585e-01 -2.686 0.00723 **
## log(P_Youth + 1) -1.502e-01 1.889e-01 -0.795 0.42663
## log(P_Cook + 1) -6.196e-01 1.611e-01 -3.847 0.00012 ***
## log(P_DIY + 1) -5.167e-01 1.905e-01 -2.712 0.00669 **
## log(P_Art + 1) 2.011e+00 1.769e-01 11.368 < 2e-16 ***
## I(First_purchase^2) 1.054e-04 1.038e-04 1.015 0.30987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1468.6 on 1590 degrees of freedom
## AIC: 1488.6
##
## Number of Fisher Scoring iterations: 5
vif(final_logistic_model2)
## Gender I(Amount_purchased^2) I(Frequency^2)
## 1.025287 1.303671 1.193533
## log(P_Child + 1) log(P_Youth + 1) log(P_Cook + 1)
## 1.486301 1.133428 1.533724
## log(P_DIY + 1) log(P_Art + 1) I(First_purchase^2)
## 1.278606 1.398183 3.120746
pred_probs <- predict(final_logistic_model2, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1969 128
## 1 127 76
##
## Accuracy : 0.8891
## 95% CI : (0.8756, 0.9017)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3126
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.37255
## Specificity : 0.93941
## Pos Pred Value : 0.37438
## Neg Pred Value : 0.93896
## Prevalence : 0.08870
## Detection Rate : 0.03304
## Detection Prevalence : 0.08826
## Balanced Accuracy : 0.65598
##
## 'Positive' Class : 1
##
proc_logistic <- roc(bbtest$Choice, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc_logistic)
coords(proc_logistic, "best", ret = "threshold")
## threshold
## 1 0.2262459
The optimal threshold for the logistic model with transformations is 0.2262459.
pred_probs <- predict(final_logistic_model2, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.2262459, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1483 49
## 1 613 155
##
## Accuracy : 0.7122
## 95% CI : (0.6932, 0.7306)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2079
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75980
## Specificity : 0.70754
## Pos Pred Value : 0.20182
## Neg Pred Value : 0.96802
## Prevalence : 0.08870
## Detection Rate : 0.06739
## Detection Prevalence : 0.33391
## Balanced Accuracy : 0.73367
##
## 'Positive' Class : 1
##
After selecting the optimal threshold, compared with the same model at the 0.5 threshold:
Logistic model with NO transformations (0.5 threshold vs. optimal threshold):
Accuracy falls from 89.4% to 66.9%.
Sensitivity rises from 35.3% to 78.9%.
Specificity falls from 94.7% to 65.7%.
Overall, still a fairly poor model.
Logistic model WITH transformations (at its optimal threshold, compared with the untransformed model at its optimal threshold):
Accuracy rises from 66.9% to 71.2%.
Sensitivity falls from 78.9% to 76.0%.
Specificity rises from 65.7% to 70.8%.
Overall, model performance improves.
#Only get the probabilities of training data
train_pred_probs <- predict(final_logistic_model2, type = 'response') #we don't include 'newdata = bbtest' so it only gets probabilities of bbtrain.
#Select only numeric predictors
numerical_data <- bbtrain[, sapply(bbtrain, is.numeric)]
predictors <- colnames(bbtrain[, sapply(bbtrain, is.numeric)])
#Bind the logit and tidying the data for plot
numerical_data <- numerical_data |>
mutate(logit = log(train_pred_probs / (1 - train_pred_probs))) |>
gather(key = 'predictors', value = 'predictor.value', -logit)
#Create scatter plots
ggplot(numerical_data, aes(logit, predictor.value)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = 'loess') +
theme_bw() +
facet_wrap(~predictors, scales = 'free_y')
## `geom_smooth()` using formula = 'y ~ x'
The transformations appear to have substantially improved the linearity between our numeric predictors and the logit of Choice.
plot(final_logistic_model2, which = 4)
#Extract model results
model.data <- augment(final_logistic_model2) |>
mutate(index = 1:n())
#Data for the top 2 largest values according to Cook's distance
model.data |>
top_n(2, .cooksd)
## # A tibble: 2 Ă— 17
## Choice Gender `I(Amount_purchased^2)` `I(Frequency^2)` `log(P_Child + 1)`
## <fct> <fct> <I<dbl>> <I<dbl>> <dbl>
## 1 1 1 1444 1156 0
## 2 1 1 74529 900 0.693
## # ℹ 12 more variables: `log(P_Youth + 1)` <dbl>, `log(P_Cook + 1)` <dbl>,
## # `log(P_DIY + 1)` <dbl>, `log(P_Art + 1)` <dbl>,
## # `I(First_purchase^2)` <I<dbl>>, .fitted <dbl>, .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>, index <int>
#Plot standardized residuals
ggplot(model.data, aes(index, .std.resid)) +
geom_point(aes(color = Choice), alpha = .5) +
theme_bw()
model.data |>
filter(abs(.std.resid) > 3)
## # A tibble: 1 Ă— 17
## Choice Gender `I(Amount_purchased^2)` `I(Frequency^2)` `log(P_Child + 1)`
## <fct> <fct> <I<dbl>> <I<dbl>> <dbl>
## 1 1 1 1444 1156 0
## # ℹ 12 more variables: `log(P_Youth + 1)` <dbl>, `log(P_Cook + 1)` <dbl>,
## # `log(P_DIY + 1)` <dbl>, `log(P_Art + 1)` <dbl>,
## # `I(First_purchase^2)` <I<dbl>>, .fitted <dbl>, .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>, index <int>
Observation 247 is flagged as a potentially problematic outlier because its standardized residual (.std.resid) exceeds 3 in absolute value.
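The row index behind that flag can be pulled directly (a one-line sketch):
#Index of the observation(s) with |standardized residual| > 3
model.data |>
  filter(abs(.std.resid) > 3) |>
  pull(index)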
final_logistic_model3 <- glm(Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) +
log(P_DIY + 1) + log(P_Art + 1) +
I(First_purchase^2),
data = bbtrain[-c(247), ], family = 'binomial')
summary(final_logistic_model3)
##
## Call:
## glm(formula = Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
## log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) + log(P_DIY +
## 1) + log(P_Art + 1) + I(First_purchase^2), family = "binomial",
## data = bbtrain[-c(247), ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.037e-01 1.644e-01 -3.064 0.00218 **
## Gender1 -8.137e-01 1.341e-01 -6.066 1.31e-09 ***
## I(Amount_purchased^2) 5.612e-06 1.793e-06 3.130 0.00175 **
## I(Frequency^2) -3.155e-03 4.499e-04 -7.013 2.33e-12 ***
## log(P_Child + 1) -4.268e-01 1.590e-01 -2.684 0.00727 **
## log(P_Youth + 1) -1.497e-01 1.895e-01 -0.790 0.42947
## log(P_Cook + 1) -6.216e-01 1.616e-01 -3.846 0.00012 ***
## log(P_DIY + 1) -5.183e-01 1.910e-01 -2.713 0.00666 **
## log(P_Art + 1) 2.019e+00 1.776e-01 11.372 < 2e-16 ***
## I(First_purchase^2) 1.073e-04 1.043e-04 1.029 0.30334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.7 on 1598 degrees of freedom
## Residual deviance: 1459.1 on 1589 degrees of freedom
## AIC: 1479.1
##
## Number of Fisher Scoring iterations: 5
vif(final_logistic_model3)
## Gender I(Amount_purchased^2) I(Frequency^2)
## 1.025942 1.302993 1.186469
## log(P_Child + 1) log(P_Youth + 1) log(P_Cook + 1)
## 1.487593 1.133135 1.535315
## log(P_DIY + 1) log(P_Art + 1) I(First_purchase^2)
## 1.277891 1.398619 3.116404
pred_probs <- predict(final_logistic_model3, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1972 126
## 1 124 78
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.3246
##
## Mcnemar's Test P-Value : 0.9496
##
## Sensitivity : 0.38235
## Specificity : 0.94084
## Pos Pred Value : 0.38614
## Neg Pred Value : 0.93994
## Prevalence : 0.08870
## Detection Rate : 0.03391
## Detection Prevalence : 0.08783
## Balanced Accuracy : 0.66160
##
## 'Positive' Class : 1
##
proc_logistic <- roc(bbtest$Choice, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc_logistic)
coords(proc_logistic, "best", ret = "threshold")
## threshold
## 1 0.2223601
pred_probs <- predict(final_logistic_model3, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.2223601, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1458 47
## 1 638 157
##
## Accuracy : 0.7022
## 95% CI : (0.683, 0.7208)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2016
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.76961
## Specificity : 0.69561
## Pos Pred Value : 0.19748
## Neg Pred Value : 0.96877
## Prevalence : 0.08870
## Detection Rate : 0.06826
## Detection Prevalence : 0.34565
## Balanced Accuracy : 0.73261
##
## 'Positive' Class : 1
##
#Undersample the majority class so the training data has the same number of 1's and 0's
#bbtrain balancing
bbtrain_yes <- bbtrain |>
filter(Choice == 1)
bbtrain_no <- bbtrain |>
filter(Choice == 0)
set.seed(42)
bbtrain_no_sampled <- sample_n(bbtrain_no, nrow(bbtrain_yes))
bbtrain_balanced = rbind(bbtrain_yes, bbtrain_no_sampled)
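The same undersampling can also be done with caret's downSample(), which randomly drops majority-class rows until the classes match (a sketch; bbtrain_balanced2 is just an illustrative name):
#Alternative: undersample the majority class with caret
set.seed(42)
bbtrain_balanced2 <- downSample(x = bbtrain[, setdiff(names(bbtrain), 'Choice')],
                                y = bbtrain$Choice, yname = 'Choice')
table(bbtrain_balanced2$Choice) #400 of each class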
full_balanced_log_model <- glm(Choice ~ ., data = bbtrain_balanced, family = 'binomial')
summary(full_balanced_log_model)
##
## Call:
## glm(formula = Choice ~ ., family = "binomial", data = bbtrain_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.760193 0.283609 2.680 0.007353 **
## Gender1 -0.885856 0.176986 -5.005 5.58e-07 ***
## Amount_purchased 0.001825 0.001015 1.799 0.072088 .
## Frequency -0.080390 0.022664 -3.547 0.000390 ***
## Last_purchase 0.687313 0.131995 5.207 1.92e-07 ***
## First_purchase -0.017341 0.018374 -0.944 0.345295
## P_Child -0.844436 0.156289 -5.403 6.55e-08 ***
## P_Youth -0.724135 0.198905 -3.641 0.000272 ***
## P_Cook -0.982831 0.157956 -6.222 4.90e-10 ***
## P_DIY -1.011396 0.191804 -5.273 1.34e-07 ***
## P_Art 0.840847 0.184287 4.563 5.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1109.04 on 799 degrees of freedom
## Residual deviance: 817.19 on 789 degrees of freedom
## AIC: 839.19
##
## Number of Fisher Scoring iterations: 5
vif(full_balanced_log_model)
## Gender Amount_purchased Frequency Last_purchase
## 1.021327 1.163255 2.995420 17.839132
## First_purchase P_Child P_Youth P_Cook
## 10.205951 3.165649 1.880310 3.248518
## P_DIY P_Art
## 1.895215 1.799390
Last_purchase and First_purchase are multicollinear in the balanced dataset as well.
step_balanced_log_model <- step(full_balanced_log_model, direction = 'both',
trace = FALSE)
summary(step_balanced_log_model)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## family = "binomial", data = bbtrain_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.833144 0.272410 3.058 0.002225 **
## Gender1 -0.886744 0.176851 -5.014 5.33e-07 ***
## Amount_purchased 0.001806 0.001014 1.781 0.074972 .
## Frequency -0.097789 0.013333 -7.335 2.22e-13 ***
## Last_purchase 0.618936 0.109337 5.661 1.51e-08 ***
## P_Child -0.845959 0.156596 -5.402 6.58e-08 ***
## P_Youth -0.724551 0.199486 -3.632 0.000281 ***
## P_Cook -0.991582 0.157809 -6.283 3.31e-10 ***
## P_DIY -1.027780 0.191324 -5.372 7.79e-08 ***
## P_Art 0.811641 0.181397 4.474 7.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1109.04 on 799 degrees of freedom
## Residual deviance: 818.08 on 790 degrees of freedom
## AIC: 838.08
##
## Number of Fisher Scoring iterations: 5
vif(step_balanced_log_model)
## Gender Amount_purchased Frequency Last_purchase
## 1.021055 1.162367 1.037703 12.194326
## P_Child P_Youth P_Cook P_DIY
## 3.157336 1.880837 3.205273 1.896451
## P_Art
## 1.734382
Last_purchase has a VIF higher than 5, so we remove it from our final model on the balanced dataset, which reuses the transformed predictors from earlier.
final_balanced_log_model <- glm(Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) +
log(P_DIY + 1) + log(P_Art + 1) +
I(First_purchase^2),
data = bbtrain_balanced, family = 'binomial')
summary(final_balanced_log_model)
##
## Call:
## glm(formula = Choice ~ Gender + I(Amount_purchased^2) + I(Frequency^2) +
## log(P_Child + 1) + log(P_Youth + 1) + log(P_Cook + 1) + log(P_DIY +
## 1) + log(P_Art + 1) + I(First_purchase^2), family = "binomial",
## data = bbtrain_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.385e-01 2.096e-01 2.570 0.01017 *
## Gender1 -8.509e-01 1.715e-01 -4.960 7.06e-07 ***
## I(Amount_purchased^2) 6.627e-06 2.392e-06 2.770 0.00560 **
## I(Frequency^2) -3.050e-03 5.217e-04 -5.846 5.04e-09 ***
## log(P_Child + 1) -4.098e-01 2.019e-01 -2.030 0.04235 *
## log(P_Youth + 1) -1.211e-01 2.517e-01 -0.481 0.63039
## log(P_Cook + 1) -6.361e-01 2.071e-01 -3.071 0.00213 **
## log(P_DIY + 1) -6.784e-01 2.489e-01 -2.726 0.00642 **
## log(P_Art + 1) 2.234e+00 2.490e-01 8.972 < 2e-16 ***
## I(First_purchase^2) 1.693e-04 1.501e-04 1.128 0.25948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1109.04 on 799 degrees of freedom
## Residual deviance: 866.77 on 790 degrees of freedom
## AIC: 886.77
##
## Number of Fisher Scoring iterations: 4
After removing Last_purchase, P_Youth becomes statistically insignificant, just as it did in our model on the unbalanced dataset.
We do not worry about this because it can still contribute to the model by other measures, such as AIC.
#Only get the probabilities of training data
train_pred_probs <- predict(final_balanced_log_model, type = 'response') #we don't include 'newdata = bbtest' so it only gets probabilities of bbtrain.
#Select only numeric predictors
numerical_data <- bbtrain_balanced[, sapply(bbtrain_balanced, is.numeric)]
predictors <- colnames(bbtrain_balanced[, sapply(bbtrain_balanced, is.numeric)])
#Bind the logit and tidying the data for plot
numerical_data <- numerical_data |>
mutate(logit = log(train_pred_probs / (1 - train_pred_probs))) |>
gather(key = 'predictors', value = 'predictor.value', -logit)
#Create scatter plots
ggplot(numerical_data, aes(logit, predictor.value)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = 'loess') +
theme_bw() +
facet_wrap(~predictors, scales = 'free_y')
## `geom_smooth()` using formula = 'y ~ x'
pred_probs <- predict(final_balanced_log_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
confusionMatrix(pred_class, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1539 61
## 1 557 143
##
## Accuracy : 0.7313
## 95% CI : (0.7127, 0.7493)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2075
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.70098
## Specificity : 0.73426
## Pos Pred Value : 0.20429
## Neg Pred Value : 0.96188
## Prevalence : 0.08870
## Detection Rate : 0.06217
## Detection Prevalence : 0.30435
## Balanced Accuracy : 0.71762
##
## 'Positive' Class : 1
##
proc_logistic_balanced <- roc(bbtest$Choice, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc_logistic_balanced)
coords(proc_logistic_balanced, "best", ret = "threshold")
## threshold
## 1 0.4356855
pred_probs <- predict(final_balanced_log_model, newdata = bbtest, type = 'response')
pred_class <- ifelse(pred_probs > 0.4356855, 1, 0)
pred_class <- factor(pred_class)
# Confusion matrix
conf_matrix <- confusionMatrix(pred_class, bbtest$Choice, positive = '1')
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1390 45
## 1 706 159
##
## Accuracy : 0.6735
## 95% CI : (0.6539, 0.6926)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1797
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.77941
## Specificity : 0.66317
## Pos Pred Value : 0.18382
## Neg Pred Value : 0.96864
## Prevalence : 0.08870
## Detection Rate : 0.06913
## Detection Prevalence : 0.37609
## Balanced Accuracy : 0.72129
##
## 'Positive' Class : 1
##
Balancing the dataset did not have a huge impact on our performance metrics.
Compared with the untransformed logistic model on the imbalanced dataset at its optimal threshold:
Accuracy improved from 66.9% to 67.3%.
Sensitivity decreased from 78.9% to 77.9%.
Specificity improved from 65.7% to 66.3%.
The model still does not give us enough confidence.
tuned <- tune.svm(Choice ~ ., data = bbtrain, gamma = seq(.01, .1, by = .01),
cost = seq(.1, 1, by = .1))
tuned$best.parameters
## gamma cost
## 51 0.01 0.6
Out of all the combinations tried, our best parameters for gamma and cost are 0.01 and 0.6.
tuned$performances
## gamma cost error dispersion
## 1 0.01 0.1 0.250000 0.02568506
## 2 0.02 0.1 0.248125 0.02733060
## 3 0.03 0.1 0.246250 0.02889757
## 4 0.04 0.1 0.242500 0.02898755
## 5 0.05 0.1 0.241875 0.02826739
## 6 0.06 0.1 0.240000 0.02719528
## 7 0.07 0.1 0.240000 0.02671220
## 8 0.08 0.1 0.241875 0.02500868
## 9 0.09 0.1 0.243125 0.02383574
## 10 0.10 0.1 0.243750 0.02357023
## 11 0.01 0.2 0.247500 0.02782635
## 12 0.02 0.2 0.228750 0.02520747
## 13 0.03 0.2 0.220000 0.02868652
## 14 0.04 0.2 0.216250 0.02889757
## 15 0.05 0.2 0.218125 0.02642633
## 16 0.06 0.2 0.218750 0.02779513
## 17 0.07 0.2 0.218750 0.02886751
## 18 0.08 0.2 0.220000 0.03087272
## 19 0.09 0.2 0.221250 0.03078826
## 20 0.10 0.2 0.220625 0.03146454
## 21 0.01 0.3 0.233750 0.02766993
## 22 0.02 0.3 0.215625 0.02783414
## 23 0.03 0.3 0.216875 0.02535340
## 24 0.04 0.3 0.216875 0.02552402
## 25 0.05 0.3 0.216875 0.02636055
## 26 0.06 0.3 0.218125 0.02675279
## 27 0.07 0.3 0.219375 0.02723515
## 28 0.08 0.3 0.221250 0.02978511
## 29 0.09 0.3 0.221250 0.03134707
## 30 0.10 0.3 0.222500 0.03078826
## 31 0.01 0.4 0.220625 0.02619538
## 32 0.02 0.4 0.215625 0.02469431
## 33 0.03 0.4 0.216250 0.02622022
## 34 0.04 0.4 0.215625 0.02622850
## 35 0.05 0.4 0.216250 0.02813657
## 36 0.06 0.4 0.217500 0.02838231
## 37 0.07 0.4 0.219375 0.02755203
## 38 0.08 0.4 0.220625 0.02619538
## 39 0.09 0.4 0.223750 0.02581989
## 40 0.10 0.4 0.226250 0.02648375
## 41 0.01 0.5 0.214375 0.02602916
## 42 0.02 0.5 0.213125 0.02507800
## 43 0.03 0.5 0.213750 0.02513851
## 44 0.04 0.5 0.215625 0.02736234
## 45 0.05 0.5 0.216875 0.02748895
## 46 0.06 0.5 0.217500 0.02598744
## 47 0.07 0.5 0.220000 0.02791978
## 48 0.08 0.5 0.221875 0.02538762
## 49 0.09 0.5 0.221875 0.02451792
## 50 0.10 0.5 0.224375 0.02707532
## 51 0.01 0.6 0.211250 0.02443813
## 52 0.02 0.6 0.213125 0.02507800
## 53 0.03 0.6 0.213750 0.02598744
## 54 0.04 0.6 0.218125 0.02832874
## 55 0.05 0.6 0.218125 0.02770912
## 56 0.06 0.6 0.220000 0.02791978
## 57 0.07 0.6 0.221875 0.02572727
## 58 0.08 0.6 0.222500 0.02415229
## 59 0.09 0.6 0.223125 0.02636055
## 60 0.10 0.6 0.223750 0.02631935
## 61 0.01 0.7 0.213125 0.02309589
## 62 0.02 0.7 0.213750 0.02531057
## 63 0.03 0.7 0.215000 0.02571883
## 64 0.04 0.7 0.218750 0.02667968
## 65 0.05 0.7 0.218750 0.02795085
## 66 0.06 0.7 0.220625 0.02636055
## 67 0.07 0.7 0.220000 0.02615392
## 68 0.08 0.7 0.222500 0.02468552
## 69 0.09 0.7 0.222500 0.02638523
## 70 0.10 0.7 0.220625 0.02811342
## 71 0.01 0.8 0.213750 0.02443813
## 72 0.02 0.8 0.214375 0.02535340
## 73 0.03 0.8 0.215000 0.02503470
## 74 0.04 0.8 0.220000 0.02713137
## 75 0.05 0.8 0.220625 0.02701112
## 76 0.06 0.8 0.221250 0.02554952
## 77 0.07 0.8 0.220625 0.02602916
## 78 0.08 0.8 0.221250 0.02450907
## 79 0.09 0.8 0.220000 0.02776389
## 80 0.10 0.8 0.218125 0.02923355
## 81 0.01 0.9 0.214375 0.02483452
## 82 0.02 0.9 0.215000 0.02622022
## 83 0.03 0.9 0.216250 0.02537907
## 84 0.04 0.9 0.219375 0.02707532
## 85 0.05 0.9 0.221250 0.02671220
## 86 0.06 0.9 0.221875 0.02655739
## 87 0.07 0.9 0.221250 0.02554952
## 88 0.08 0.9 0.220625 0.02636055
## 89 0.09 0.9 0.217500 0.02822897
## 90 0.10 0.9 0.217500 0.02913689
## 91 0.01 1.0 0.214375 0.02465913
## 92 0.02 1.0 0.215000 0.02622022
## 93 0.03 1.0 0.218125 0.02576099
## 94 0.04 1.0 0.220000 0.02631935
## 95 0.05 1.0 0.221875 0.02688227
## 96 0.06 1.0 0.221250 0.02703521
## 97 0.07 1.0 0.222500 0.02735441
## 98 0.08 1.0 0.218125 0.02893509
## 99 0.09 1.0 0.216875 0.02887503
## 100 0.10 1.0 0.219375 0.02817511
Here we can corroborate that the best parameters are indeed gamma = 0.01 and cost = 0.6: that row of the grid has the lowest cross-validation error, and its dispersion is also among the smallest.
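The same row can be pulled out programmatically instead of scanning the grid (sketch):
#Tuning-grid row with the lowest cross-validation error
tuned$performances[which.min(tuned$performances$error), ]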
svm_model = svm(Choice ~ ., data = bbtrain, gamma = tuned$best.parameters$gamma,
cost = tuned$best.parameters$cost)
summary(svm_model)
##
## Call:
## svm(formula = Choice ~ ., data = bbtrain, gamma = tuned$best.parameters$gamma,
## cost = tuned$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.6
##
## Number of Support Vectors: 792
##
## ( 392 400 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict = predict(svm_model, bbtest, type = 'response')
confusionMatrix(svmpredict, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2064 176
## 1 32 28
##
## Accuracy : 0.9096
## 95% CI : (0.8971, 0.921)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.6327
##
## Kappa : 0.179
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.13725
## Specificity : 0.98473
## Pos Pred Value : 0.46667
## Neg Pred Value : 0.92143
## Prevalence : 0.08870
## Detection Rate : 0.01217
## Detection Prevalence : 0.02609
## Balanced Accuracy : 0.56099
##
## 'Positive' Class : 1
##
Compute the test error. Because the factor levels convert to 1 and 2, the squared differences are 0 or 1, so this is simply the misclassification rate.
mean((as.numeric(svmpredict) - as.numeric(bbtest$Choice))^2)
## [1] 0.09043478
tuned_balanced <- tune.svm(Choice ~ ., data = bbtrain_balanced, gamma = seq(.01, .1, by = .01),
cost = seq(.1, 1, by = .1))
tuned_balanced$best.parameters
## gamma cost
## 13 0.03 0.2
tuned_balanced$performances
## gamma cost error dispersion
## 1 0.01 0.1 0.33625 0.03747684
## 2 0.02 0.1 0.31375 0.04143687
## 3 0.03 0.1 0.28750 0.04082483
## 4 0.04 0.1 0.28125 0.04135299
## 5 0.05 0.1 0.28000 0.04090979
## 6 0.06 0.1 0.27875 0.05036326
## 7 0.07 0.1 0.28500 0.04706674
## 8 0.08 0.1 0.29000 0.04993051
## 9 0.09 0.1 0.29500 0.04721405
## 10 0.10 0.1 0.29750 0.04706674
## 11 0.01 0.2 0.30500 0.03496029
## 12 0.02 0.2 0.28125 0.04497299
## 13 0.03 0.2 0.27125 0.05529278
## 14 0.04 0.2 0.27375 0.04581439
## 15 0.05 0.2 0.27875 0.04372023
## 16 0.06 0.2 0.27625 0.04619178
## 17 0.07 0.2 0.28000 0.04417453
## 18 0.08 0.2 0.28500 0.04923018
## 19 0.09 0.2 0.28750 0.05170697
## 20 0.10 0.2 0.28750 0.04930066
## 21 0.01 0.3 0.28375 0.04489571
## 22 0.02 0.3 0.27500 0.05773503
## 23 0.03 0.3 0.27625 0.05696307
## 24 0.04 0.3 0.28375 0.04788949
## 25 0.05 0.3 0.27875 0.04896498
## 26 0.06 0.3 0.28000 0.04571956
## 27 0.07 0.3 0.28375 0.04788949
## 28 0.08 0.3 0.28500 0.04479893
## 29 0.09 0.3 0.28500 0.04706674
## 30 0.10 0.3 0.28125 0.04686342
## 31 0.01 0.4 0.28000 0.04456581
## 32 0.02 0.4 0.27625 0.05541823
## 33 0.03 0.4 0.27750 0.04594683
## 34 0.04 0.4 0.27875 0.04641674
## 35 0.05 0.4 0.28375 0.04372023
## 36 0.06 0.4 0.28375 0.04168749
## 37 0.07 0.4 0.28125 0.04093101
## 38 0.08 0.4 0.28000 0.04133199
## 39 0.09 0.4 0.28125 0.04340139
## 40 0.10 0.4 0.28250 0.04216370
## 41 0.01 0.5 0.27750 0.04779877
## 42 0.02 0.5 0.27500 0.05170697
## 43 0.03 0.5 0.27875 0.05272110
## 44 0.04 0.5 0.28625 0.04016027
## 45 0.05 0.5 0.28250 0.04216370
## 46 0.06 0.5 0.28125 0.04259385
## 47 0.07 0.5 0.28000 0.04174992
## 48 0.08 0.5 0.27750 0.03987829
## 49 0.09 0.5 0.28000 0.04830459
## 50 0.10 0.5 0.28500 0.04816061
## 51 0.01 0.6 0.27875 0.05239076
## 52 0.02 0.6 0.27375 0.05118390
## 53 0.03 0.6 0.28000 0.04495368
## 54 0.04 0.6 0.28750 0.04409586
## 55 0.05 0.6 0.28375 0.04291869
## 56 0.06 0.6 0.28125 0.04379958
## 57 0.07 0.6 0.27875 0.04528076
## 58 0.08 0.6 0.28250 0.04721405
## 59 0.09 0.6 0.28625 0.04945888
## 60 0.10 0.6 0.28625 0.04767147
## 61 0.01 0.7 0.27500 0.05559027
## 62 0.02 0.7 0.27625 0.04730589
## 63 0.03 0.7 0.28000 0.04417453
## 64 0.04 0.7 0.28125 0.04007372
## 65 0.05 0.7 0.28125 0.04340139
## 66 0.06 0.7 0.28125 0.04093101
## 67 0.07 0.7 0.28375 0.04715886
## 68 0.08 0.7 0.28500 0.04923018
## 69 0.09 0.7 0.28750 0.04787136
## 70 0.10 0.7 0.28625 0.04656611
## 71 0.01 0.8 0.27375 0.05285265
## 72 0.02 0.8 0.27625 0.04656611
## 73 0.03 0.8 0.27750 0.04923018
## 74 0.04 0.8 0.27750 0.03944053
## 75 0.05 0.8 0.28000 0.04005205
## 76 0.06 0.8 0.28375 0.04126894
## 77 0.07 0.8 0.28500 0.04706674
## 78 0.08 0.8 0.28625 0.04875178
## 79 0.09 0.8 0.29000 0.04958158
## 80 0.10 0.8 0.29000 0.04958158
## 81 0.01 0.9 0.27375 0.05876330
## 82 0.02 0.9 0.27500 0.04714045
## 83 0.03 0.9 0.27500 0.04564355
## 84 0.04 0.9 0.27875 0.03821086
## 85 0.05 0.9 0.27875 0.04126894
## 86 0.06 0.9 0.28125 0.04007372
## 87 0.07 0.9 0.28500 0.04993051
## 88 0.08 0.9 0.28875 0.04803428
## 89 0.09 0.9 0.29000 0.04958158
## 90 0.10 0.9 0.29250 0.04794383
## 91 0.01 1.0 0.27500 0.05559027
## 92 0.02 1.0 0.27250 0.04322101
## 93 0.03 1.0 0.27875 0.04678927
## 94 0.04 1.0 0.28250 0.03872983
## 95 0.05 1.0 0.27750 0.03899786
## 96 0.06 1.0 0.27625 0.04387878
## 97 0.07 1.0 0.28750 0.04602234
## 98 0.08 1.0 0.29125 0.04825065
## 99 0.09 1.0 0.29500 0.04758034
## 100 0.10 1.0 0.29500 0.04794383
svm_balanced_model = svm(Choice ~ ., data = bbtrain_balanced,
gamma = tuned_balanced$best.parameters$gamma,
cost = tuned_balanced$best.parameters$cost)
summary(svm_balanced_model)
##
## Call:
## svm(formula = Choice ~ ., data = bbtrain_balanced, gamma = tuned_balanced$best.parameters$gamma,
## cost = tuned_balanced$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.2
##
## Number of Support Vectors: 634
##
## ( 316 318 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svmpredict_balanced <- predict(svm_balanced_model, bbtest, type = 'response')
confusionMatrix(svmpredict_balanced, bbtest$Choice, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1625 66
## 1 471 138
##
## Accuracy : 0.7665
## 95% CI : (0.7487, 0.7837)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2383
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6765
## Specificity : 0.7753
## Pos Pred Value : 0.2266
## Neg Pred Value : 0.9610
## Prevalence : 0.0887
## Detection Rate : 0.0600
## Detection Prevalence : 0.2648
## Balanced Accuracy : 0.7259
##
## 'Positive' Class : 1
##
Compute the test error for the balanced-data SVM (again equal to the misclassification rate).
mean((as.numeric(svmpredict_balanced) - as.numeric(bbtest$Choice))^2)
## [1] 0.2334783