Future Improvements
Future work could include interaction terms and more advanced models,
like XGBoost, CatBoost, LightGBM, or a Random Forest, to try and better
capture non-linear relationships.
set.seed(321)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6 2023-06-27
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(pander)
## Warning: package 'pander' was built under R version 4.4.3
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
data_path <-"C:/Users/rg03/Downloads/sta321/online_shoppers_intention.csv"
dat <- read.csv(data_path, stringsAsFactors = FALSE)
dat$Revenue <- factor(dat$Revenue, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
glimpse(dat)
## Rows: 12,330
## Columns: 18
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2…
## $ Administrative_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5…
## $ Informational <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Informational_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16, 7, 6, 2…
## $ ProductRelated_Duration <dbl> 0.000000, 64.000000, 0.000000, 2.666667, 627.5…
## $ BounceRates <dbl> 0.200000000, 0.000000000, 0.200000000, 0.05000…
## $ ExitRates <dbl> 0.200000000, 0.100000000, 0.200000000, 0.14000…
## $ PageValues <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SpecialDay <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.8, 0…
## $ Month <chr> "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb…
## $ OperatingSystems <int> 1, 2, 4, 3, 3, 2, 2, 1, 2, 2, 1, 1, 1, 2, 3, 1…
## $ Browser <int> 1, 2, 1, 2, 3, 2, 4, 2, 2, 4, 1, 1, 1, 5, 2, 1…
## $ Region <int> 1, 1, 9, 2, 1, 1, 3, 1, 2, 1, 3, 4, 1, 1, 3, 9…
## $ TrafficType <int> 1, 2, 3, 4, 4, 3, 3, 5, 3, 2, 3, 3, 3, 3, 3, 3…
## $ VisitorType <chr> "Returning_Visitor", "Returning_Visitor", "Ret…
## $ Weekend <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ Revenue <fct> No, No, No, No, No, No, No, No, No, No, No, No…
summary(dat$Revenue)
## No Yes
## 10422 1908
use_vars <- c(
"Administrative_Duration", "Informational_Duration", "ProductRelated_Duration",
"BounceRates", "ExitRates", "PageValues",
"Month", "VisitorType", "Weekend"
)
dat2 <- dat %>%
dplyr::select(Revenue, dplyr::all_of(use_vars)) %>%
na.omit() %>%
mutate(
Month = factor(Month),
VisitorType = factor(VisitorType),
Weekend = factor(Weekend)
)
summary(dat2)
## Revenue Administrative_Duration Informational_Duration
## No :10422 Min. : 0.00 Min. : 0.00
## Yes: 1908 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 7.50 Median : 0.00
## Mean : 80.82 Mean : 34.47
## 3rd Qu.: 93.26 3rd Qu.: 0.00
## Max. :3398.75 Max. :2549.38
##
## ProductRelated_Duration BounceRates ExitRates PageValues
## Min. : 0.0 Min. :0.000000 Min. :0.00000 Min. : 0.000
## 1st Qu.: 184.1 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000
## Median : 598.9 Median :0.003112 Median :0.02516 Median : 0.000
## Mean : 1194.8 Mean :0.022191 Mean :0.04307 Mean : 5.889
## 3rd Qu.: 1464.2 3rd Qu.:0.016813 3rd Qu.:0.05000 3rd Qu.: 0.000
## Max. :63973.5 Max. :0.200000 Max. :0.20000 Max. :361.764
##
## Month VisitorType Weekend
## May :3364 New_Visitor : 1694 FALSE:9462
## Nov :2998 Other : 85 TRUE :2868
## Mar :1907 Returning_Visitor:10551
## Dec :1727
## Oct : 549
## Sep : 448
## (Other):1337
pairs(
~ Administrative_Duration + Informational_Duration +
ProductRelated_Duration + BounceRates + ExitRates + PageValues,
data = dat2,
main = "Scatterplot Matrix of Continuous Predictors"
)

num_df <- dat2 %>% dplyr::select(where(is.numeric))
if (ncol(num_df) > 1) {
round(cor(num_df), 2)
}
## Administrative_Duration Informational_Duration
## Administrative_Duration 1.00 0.24
## Informational_Duration 0.24 1.00
## ProductRelated_Duration 0.36 0.35
## BounceRates -0.14 -0.07
## ExitRates -0.21 -0.11
## PageValues 0.07 0.03
## ProductRelated_Duration BounceRates ExitRates
## Administrative_Duration 0.36 -0.14 -0.21
## Informational_Duration 0.35 -0.07 -0.11
## ProductRelated_Duration 1.00 -0.18 -0.25
## BounceRates -0.18 1.00 0.91
## ExitRates -0.25 0.91 1.00
## PageValues 0.05 -0.12 -0.17
## PageValues
## Administrative_Duration 0.07
## Informational_Duration 0.03
## ProductRelated_Duration 0.05
## BounceRates -0.12
## ExitRates -0.17
## PageValues 1.00
num_vars <- c("Administrative_Duration","Informational_Duration","ProductRelated_Duration",
"BounceRates","ExitRates","PageValues")
safe_pagevalues_tertile <- function(x) {
ux <- unique(x[is.finite(x)])
if (length(ux) >= 3) {
factor(dplyr::ntile(x, 3), labels = c("Low","Mid","High"))
} else {
med <- median(x, na.rm = TRUE)
factor(ifelse(x <= med, "Low", "High"), levels = c("Low","High"))
}
}
dat3 <- dat2 %>%
mutate(
across(all_of(num_vars), ~ as.numeric(scale(.x)), .names = "{.col}_z"),
PageValues_tertile = safe_pagevalues_tertile(PageValues)
)
pander(data.frame(Standardized = paste0(num_vars, "_z")),
caption = "Numerical Variables Standardized (z-scores)")
Numerical Variables Standardized (z-scores)
| Administrative_Duration_z |
| Informational_Duration_z |
| ProductRelated_Duration_z |
| BounceRates_z |
| ExitRates_z |
| PageValues_z |
pander(table(dat3$PageValues_tertile), caption = "Discretized Variable: PageValues (safe tertiles/median split)")
Discretized Variable: PageValues (safe tertiles/median
split)
| 4110 |
4110 |
4110 |
set.seed(321)
idx <- createDataPartition(dat3$Revenue, p = 0.70, list = FALSE)
train <- dat3[idx, ]
test <- dat3[-idx, ]
pander(rbind(Train = prop.table(table(train$Revenue)),
Test = prop.table(table(test$Revenue))),
caption = "Stratified Split: Class Proportions (Revenue)")
Stratified Split: Class Proportions (Revenue)
| Train |
0.8452 |
0.1548 |
| Test |
0.8453 |
0.1547 |
full.model <- glm(
Revenue ~ Administrative_Duration_z + Informational_Duration_z +
ProductRelated_Duration_z + BounceRates_z + ExitRates_z +
PageValues_z + Month + VisitorType + Weekend,
data = train, family = binomial(link = "logit")
)
pander(summary(full.model)$coef, caption = "Full Model (Train) — Inferential Statistics")
Full Model (Train) — Inferential Statistics
| (Intercept) |
-1.908 |
0.2169 |
-8.796 |
1.412e-18 |
| Administrative_Duration_z |
0.0259 |
0.03352 |
0.7726 |
0.4397 |
| Informational_Duration_z |
0.03611 |
0.02992 |
1.207 |
0.2275 |
| ProductRelated_Duration_z |
0.2041 |
0.03402 |
5.999 |
1.985e-09 |
| BounceRates_z |
-0.2185 |
0.1833 |
-1.192 |
0.2332 |
| ExitRates_z |
-0.67 |
0.1347 |
-4.973 |
6.598e-07 |
| PageValues_z |
1.594 |
0.05489 |
29.04 |
1.97e-185 |
| MonthDec |
-0.4409 |
0.2233 |
-1.975 |
0.0483 |
| MonthFeb |
-1.825 |
0.7998 |
-2.281 |
0.02253 |
| MonthJul |
0.07092 |
0.2728 |
0.2599 |
0.7949 |
| MonthJune |
-0.4791 |
0.357 |
-1.342 |
0.1796 |
| MonthMar |
-0.3442 |
0.2196 |
-1.567 |
0.1171 |
| MonthMay |
-0.4549 |
0.2087 |
-2.18 |
0.02929 |
| MonthNov |
0.6967 |
0.2011 |
3.464 |
0.0005313 |
| MonthOct |
0.1461 |
0.2456 |
0.595 |
0.5518 |
| MonthSep |
0.1604 |
0.262 |
0.6122 |
0.5404 |
| VisitorTypeOther |
-1.237 |
0.7403 |
-1.671 |
0.09469 |
| VisitorTypeReturning_Visitor |
-0.3463 |
0.1027 |
-3.373 |
0.0007432 |
| WeekendTRUE |
0.07555 |
0.08529 |
0.8858 |
0.3757 |
pander(vif(full.model), caption = "VIF — Full Model")
VIF — Full Model
| Administrative_Duration_z |
1.161 |
1 |
1.077 |
| Informational_Duration_z |
1.142 |
1 |
1.068 |
| ProductRelated_Duration_z |
1.332 |
1 |
1.154 |
| BounceRates_z |
2.023 |
1 |
1.422 |
| ExitRates_z |
2.149 |
1 |
1.466 |
| PageValues_z |
1.064 |
1 |
1.031 |
| Month |
1.103 |
9 |
1.005 |
| VisitorType |
1.126 |
2 |
1.03 |
| Weekend |
1.011 |
1 |
1.006 |
reduced.model <- glm(
Revenue ~ BounceRates_z + ExitRates_z + PageValues_z + VisitorType,
data = train, family = binomial(link = "logit")
)
pander(summary(reduced.model)$coef, caption = "Reduced Model (Train) — Inferential Statistics")
Reduced Model (Train) — Inferential Statistics
| (Intercept) |
-2.007 |
0.1041 |
-19.29 |
6.543e-83 |
| BounceRates_z |
-0.02422 |
0.176 |
-0.1376 |
0.8905 |
| ExitRates_z |
-0.9555 |
0.1299 |
-7.353 |
1.937e-13 |
| PageValues_z |
1.528 |
0.05267 |
29.01 |
5.482e-185 |
| VisitorTypeOther |
-1.164 |
0.714 |
-1.63 |
0.1031 |
| VisitorTypeReturning_Visitor |
-0.1751 |
0.09759 |
-1.794 |
0.07278 |
final.model <- stepAIC(
reduced.model,
scope = list(lower = formula(reduced.model), upper = formula(full.model)),
direction = "forward",
trace = 0
)
pander(summary(final.model)$coef, caption = "Stepwise Forward Model (Train) — Inferential Statistics")
Stepwise Forward Model (Train) — Inferential
Statistics
| (Intercept) |
-1.889 |
0.216 |
-8.744 |
2.242e-18 |
| BounceRates_z |
-0.2048 |
0.183 |
-1.119 |
0.2633 |
| ExitRates_z |
-0.6872 |
0.1343 |
-5.116 |
3.128e-07 |
| PageValues_z |
1.596 |
0.05488 |
29.08 |
7.394e-186 |
| VisitorTypeOther |
-1.25 |
0.7434 |
-1.682 |
0.09257 |
| VisitorTypeReturning_Visitor |
-0.3482 |
0.1026 |
-3.395 |
0.0006875 |
| MonthDec |
-0.4375 |
0.2235 |
-1.958 |
0.05023 |
| MonthFeb |
-1.842 |
0.801 |
-2.299 |
0.02148 |
| MonthJul |
0.07393 |
0.2728 |
0.2711 |
0.7863 |
| MonthJune |
-0.4998 |
0.3574 |
-1.398 |
0.162 |
| MonthMar |
-0.3352 |
0.2198 |
-1.525 |
0.1272 |
| MonthMay |
-0.4528 |
0.209 |
-2.167 |
0.03025 |
| MonthNov |
0.699 |
0.2013 |
3.473 |
0.0005153 |
| MonthOct |
0.1548 |
0.2457 |
0.6302 |
0.5286 |
| MonthSep |
0.1574 |
0.2624 |
0.5998 |
0.5486 |
| ProductRelated_Duration_z |
0.224 |
0.03086 |
7.258 |
3.923e-13 |
discrete.model <- glm(
Revenue ~ PageValues_tertile + BounceRates_z + ExitRates_z + VisitorType + Month + Weekend,
data = train, family = binomial(link = "logit")
)
pander(summary(discrete.model)$coef, caption = "Discrete Variant (Train) — Inferential Statistics")
Discrete Variant (Train) — Inferential Statistics
| (Intercept) |
-19.6 |
189.4 |
-0.1035 |
0.9176 |
| PageValues_tertileMid |
17.28 |
189.4 |
0.09123 |
0.9273 |
| PageValues_tertileHigh |
19.45 |
189.4 |
0.1027 |
0.9182 |
| BounceRates_z |
-0.1847 |
0.2034 |
-0.9079 |
0.3639 |
| ExitRates_z |
-0.9386 |
0.1396 |
-6.724 |
1.763e-11 |
| VisitorTypeOther |
-0.2927 |
0.406 |
-0.721 |
0.4709 |
| VisitorTypeReturning_Visitor |
-0.7019 |
0.09673 |
-7.256 |
3.992e-13 |
| MonthDec |
-0.9812 |
0.201 |
-4.88 |
1.06e-06 |
| MonthFeb |
0.05682 |
0.9402 |
0.06044 |
0.9518 |
| MonthJul |
-0.03925 |
0.2537 |
-0.1547 |
0.877 |
| MonthJune |
-0.5072 |
0.3373 |
-1.503 |
0.1327 |
| MonthMar |
0.9172 |
0.2315 |
3.961 |
7.464e-05 |
| MonthMay |
0.06206 |
0.1966 |
0.3157 |
0.7522 |
| MonthNov |
0.07288 |
0.1847 |
0.3945 |
0.6932 |
| MonthOct |
0.1282 |
0.2234 |
0.5741 |
0.5659 |
| MonthSep |
0.1357 |
0.2401 |
0.5652 |
0.5719 |
| WeekendTRUE |
-0.02751 |
0.08263 |
-0.3329 |
0.7392 |
global.measure <- function(m) {
cbind(
Deviance = m$deviance,
Null.Deviance = m$null.deviance,
AIC = m$aic,
DF = df.residual(m)
)
}
goodness <- rbind(
Full = global.measure(full.model),
Reduced = global.measure(reduced.model),
Stepwise = global.measure(final.model),
Discrete = global.measure(discrete.model)
)
pander(goodness, caption = "Global Goodness-of-Fit on Train: Deviance, Null Deviance, AIC, DF")
Global Goodness-of-Fit on Train: Deviance, Null Deviance, AIC,
DF
| 4980 |
7439 |
5018 |
8613 |
| 5259 |
7439 |
5271 |
8626 |
| 4983 |
7439 |
5015 |
8616 |
| 4772 |
7439 |
4806 |
8615 |
set.seed(321)
ctrl <- trainControl(
method = "cv", number = 5,
classProbs = TRUE, summaryFunction = twoClassSummary,
savePredictions = "final"
)
cv_full <- train(formula(full.model), data = train, method = "glm",
family = binomial(), trControl = ctrl, metric = "ROC")
cv_reduced <- train(formula(reduced.model), data = train, method = "glm",
family = binomial(), trControl = ctrl, metric = "ROC")
cv_step <- train(formula(final.model), data = train, method = "glm",
family = binomial(), trControl = ctrl, metric = "ROC")
cv_discrete <- train(formula(discrete.model), data = train, method = "glm",
family = binomial(), trControl = ctrl, metric = "ROC")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv_tab <- tibble(
Model = c("Full (z)","Reduced","Stepwise","Discrete"),
CV_AUC = c(max(cv_full$results$ROC),
max(cv_reduced$results$ROC),
max(cv_step$results$ROC),
max(cv_discrete$results$ROC))
) %>% arrange(desc(CV_AUC))
pander(cv_tab, caption = "5-Fold CV ROC (AUC) — Higher is Better")
5-Fold CV ROC (AUC) — Higher is Better
| Full (z) |
0.893 |
| Stepwise |
0.8918 |
| Discrete |
0.8914 |
| Reduced |
0.8666 |
aic_tab <- tibble(
Model = c("Full (z)","Reduced","Stepwise","Discrete"),
AIC = c(AIC(full.model), AIC(reduced.model), AIC(final.model), AIC(discrete.model))
) %>% arrange(AIC)
pander(aic_tab, caption = "AIC on Training Data — Lower is Better")
AIC on Training Data — Lower is Better
| Discrete |
4806 |
| Stepwise |
5015 |
| Full (z) |
5018 |
| Reduced |
5271 |
final_name <- cv_tab$Model[1]
chosen_model <- switch(final_name,
"Full (z)" = full.model,
"Reduced" = reduced.model,
"Stepwise" = final.model,
"Discrete" = discrete.model
)
cat("**Final Model Selected by CV AUC:**", final_name, "\n\n")
## **Final Model Selected by CV AUC:** Full (z)
phat <- predict(chosen_model, newdata = test, type = "response")
pred <- factor(ifelse(phat >= 0.5, "Yes", "No"), levels = c("No","Yes"))
cm <- confusionMatrix(pred, test$Revenue, positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3039 341
## Yes 87 231
##
## Accuracy : 0.8843
## 95% CI : (0.8735, 0.8944)
## No Information Rate : 0.8453
## P-Value [Acc > NIR] : 6.318e-12
##
## Kappa : 0.4593
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.40385
## Specificity : 0.97217
## Pos Pred Value : 0.72642
## Neg Pred Value : 0.89911
## Prevalence : 0.15468
## Detection Rate : 0.06247
## Detection Prevalence : 0.08599
## Balanced Accuracy : 0.68801
##
## 'Positive' Class : Yes
##
roc_obj <- roc(response = test$Revenue, predictor = phat, levels = c("No","Yes"))
## Setting direction: controls < cases
plot(roc_obj, main = paste("ROC — Final Model on Test (", final_name, ")"))

auc_val <- auc(roc_obj)
auc_val
## Area under the curve: 0.9022
y_num <- as.numeric(test$Revenue) - 1
hoslem.test(y_num, phat, g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: y_num, phat
## X-squared = 62.186, df = 8, p-value = 1.734e-10
coef_tab <- summary(chosen_model)$coef
odds <- exp(coef(chosen_model))
out <- cbind(coef_tab, `Odds Ratio` = odds)
pander(out, digits = 4, caption = paste("Final Model Coefficients with Odds Ratios —", final_name))
Final Model Coefficients with Odds Ratios — Full (z) (continued
below)
| (Intercept) |
-1.908 |
0.2169 |
-8.796 |
1.412e-18 |
| Administrative_Duration_z |
0.0259 |
0.03352 |
0.7726 |
0.4397 |
| Informational_Duration_z |
0.03611 |
0.02992 |
1.207 |
0.2275 |
| ProductRelated_Duration_z |
0.2041 |
0.03402 |
5.999 |
1.985e-09 |
| BounceRates_z |
-0.2185 |
0.1833 |
-1.192 |
0.2332 |
| ExitRates_z |
-0.67 |
0.1347 |
-4.973 |
6.598e-07 |
| PageValues_z |
1.594 |
0.05489 |
29.04 |
1.97e-185 |
| MonthDec |
-0.4409 |
0.2233 |
-1.975 |
0.0483 |
| MonthFeb |
-1.825 |
0.7998 |
-2.281 |
0.02253 |
| MonthJul |
0.07092 |
0.2728 |
0.2599 |
0.7949 |
| MonthJune |
-0.4791 |
0.357 |
-1.342 |
0.1796 |
| MonthMar |
-0.3442 |
0.2196 |
-1.567 |
0.1171 |
| MonthMay |
-0.4549 |
0.2087 |
-2.18 |
0.02929 |
| MonthNov |
0.6967 |
0.2011 |
3.464 |
0.0005313 |
| MonthOct |
0.1461 |
0.2456 |
0.595 |
0.5518 |
| MonthSep |
0.1604 |
0.262 |
0.6122 |
0.5404 |
| VisitorTypeOther |
-1.237 |
0.7403 |
-1.671 |
0.09469 |
| VisitorTypeReturning_Visitor |
-0.3463 |
0.1027 |
-3.373 |
0.0007432 |
| WeekendTRUE |
0.07555 |
0.08529 |
0.8858 |
0.3757 |
| (Intercept) |
0.1483 |
| Administrative_Duration_z |
1.026 |
| Informational_Duration_z |
1.037 |
| ProductRelated_Duration_z |
1.226 |
| BounceRates_z |
0.8037 |
| ExitRates_z |
0.5117 |
| PageValues_z |
4.923 |
| MonthDec |
0.6434 |
| MonthFeb |
0.1613 |
| MonthJul |
1.073 |
| MonthJune |
0.6194 |
| MonthMar |
0.7088 |
| MonthMay |
0.6345 |
| MonthNov |
2.007 |
| MonthOct |
1.157 |
| MonthSep |
1.174 |
| VisitorTypeOther |
0.2902 |
| VisitorTypeReturning_Visitor |
0.7073 |
| WeekendTRUE |
1.078 |