General Discussion:
This analysis provides insight into e-commerce purchasing behavior that can
inform website optimization, marketing timing, and user retention. Its main
limitations are that the data set lacks demographic and pricing information,
which limits the explanatory power of the model. Some variables are also
heavily skewed, which may distort the effect estimates. Incorporating
interaction terms, cross-validation, or other supervised learning techniques
could yield better results.
# Fix the random-number seed so any stochastic step in the report is reproducible.
set.seed(321)
# Default knitr chunk options: do not echo messages or warnings into the report.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6 2023-06-27
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(pander)
## Warning: package 'pander' was built under R version 4.4.3
# Read the raw CSV, keeping text columns as character rather than factor.
data_path <- "C:/Users/rg03/Downloads/sta321/online_shoppers_intention.csv"
dat <- read.csv(file = data_path, stringsAsFactors = FALSE)

# Recode the response as a two-level factor; "No" comes first, so "Yes" is the
# event modelled by the logistic regressions fitted on this data.
dat[["Revenue"]] <- factor(
  dat[["Revenue"]],
  levels = c(FALSE, TRUE),
  labels = c("No", "Yes")
)

# Compact overview of column names, types, and leading values.
glimpse(dat)
## Rows: 12,330
## Columns: 18
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2…
## $ Administrative_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5…
## $ Informational <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Informational_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16, 7, 6, 2…
## $ ProductRelated_Duration <dbl> 0.000000, 64.000000, 0.000000, 2.666667, 627.5…
## $ BounceRates <dbl> 0.200000000, 0.000000000, 0.200000000, 0.05000…
## $ ExitRates <dbl> 0.200000000, 0.100000000, 0.200000000, 0.14000…
## $ PageValues <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SpecialDay <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.8, 0…
## $ Month <chr> "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb…
## $ OperatingSystems <int> 1, 2, 4, 3, 3, 2, 2, 1, 2, 2, 1, 1, 1, 2, 3, 1…
## $ Browser <int> 1, 2, 1, 2, 3, 2, 4, 2, 2, 4, 1, 1, 1, 5, 2, 1…
## $ Region <int> 1, 1, 9, 2, 1, 1, 3, 1, 2, 1, 3, 4, 1, 1, 3, 9…
## $ TrafficType <int> 1, 2, 3, 4, 4, 3, 3, 5, 3, 2, 3, 3, 3, 3, 3, 3…
## $ VisitorType <chr> "Returning_Visitor", "Returning_Visitor", "Ret…
## $ Weekend <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ Revenue <fct> No, No, No, No, No, No, No, No, No, No, No, No…
# Class balance of the response: counts of the "No" and "Yes" levels of Revenue.
summary(dat$Revenue)
## No Yes
## 10422 1908
# Names of the nine predictors carried into the modelling data set.
use_vars <- c(
  "Administrative_Duration",
  "Informational_Duration",
  "ProductRelated_Duration",
  "BounceRates",
  "ExitRates",
  "PageValues",
  "Month",
  "VisitorType",
  "Weekend"
)

# Keep the response plus the selected predictors, drop rows with any missing
# value, and convert the three categorical predictors to factors.
dat2 <- dat %>%
  dplyr::select(Revenue, dplyr::all_of(use_vars)) %>%
  na.omit() %>%
  dplyr::mutate(
    dplyr::across(dplyr::all_of(c("Month", "VisitorType", "Weekend")), factor)
  )

# Per-column summaries of the modelling data set.
summary(dat2)
## Revenue Administrative_Duration Informational_Duration
## No :10422 Min. : 0.00 Min. : 0.00
## Yes: 1908 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 7.50 Median : 0.00
## Mean : 80.82 Mean : 34.47
## 3rd Qu.: 93.26 3rd Qu.: 0.00
## Max. :3398.75 Max. :2549.38
##
## ProductRelated_Duration BounceRates ExitRates PageValues
## Min. : 0.0 Min. :0.000000 Min. :0.00000 Min. : 0.000
## 1st Qu.: 184.1 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000
## Median : 598.9 Median :0.003112 Median :0.02516 Median : 0.000
## Mean : 1194.8 Mean :0.022191 Mean :0.04307 Mean : 5.889
## 3rd Qu.: 1464.2 3rd Qu.:0.016813 3rd Qu.:0.05000 3rd Qu.: 0.000
## Max. :63973.5 Max. :0.200000 Max. :0.20000 Max. :361.764
##
## Month VisitorType Weekend
## May :3364 New_Visitor : 1694 FALSE:9462
## Nov :2998 Other : 85 TRUE :2868
## Mar :1907 Returning_Visitor:10551
## Dec :1727
## Oct : 549
## Sep : 448
## (Other):1337
# Scatterplot matrix of the six continuous predictors in dat2, used to eyeball
# pairwise relationships before modelling.
pairs(
~ Administrative_Duration + Informational_Duration +
ProductRelated_Duration + BounceRates + ExitRates + PageValues,
data = dat2,
main = "Scatterplot Matrix of Continuous Predictors"
)

# Correlation matrix (rounded to two decimals) of the numeric columns of dat2.
# It is only computed when at least two numeric columns exist.
num_df <- dplyr::select(dat2, where(is.numeric))
if (ncol(num_df) >= 2) {
  round(cor(num_df), digits = 2)
}
## Administrative_Duration Informational_Duration
## Administrative_Duration 1.00 0.24
## Informational_Duration 0.24 1.00
## ProductRelated_Duration 0.36 0.35
## BounceRates -0.14 -0.07
## ExitRates -0.21 -0.11
## PageValues 0.07 0.03
## ProductRelated_Duration BounceRates ExitRates
## Administrative_Duration 0.36 -0.14 -0.21
## Informational_Duration 0.35 -0.07 -0.11
## ProductRelated_Duration 1.00 -0.18 -0.25
## BounceRates -0.18 1.00 0.91
## ExitRates -0.25 0.91 1.00
## PageValues 0.05 -0.12 -0.17
## PageValues
## Administrative_Duration 0.07
## Informational_Duration 0.03
## ProductRelated_Duration 0.05
## BounceRates -0.12
## ExitRates -0.17
## PageValues 1.00
# Full model: logistic regression (logit link) of Revenue on all nine
# candidate predictors in dat2.
full.model <- glm(
  Revenue ~ Administrative_Duration + Informational_Duration +
    ProductRelated_Duration + BounceRates + ExitRates +
    PageValues + Month + VisitorType + Weekend,
  data = dat2, family = binomial(link = "logit")
)

# coef() on the summary returns the complete coefficient table (estimate,
# standard error, z value, p-value) without relying on partial `$` matching.
pander(coef(summary(full.model)),
  caption = "Full Model — Inferential Statistics")
Full Model — Inferential Statistics (continued below)
| (Intercept) |
-1.656 |
0.1739 |
-9.527 |
| Administrative_Duration |
-7.046e-05 |
0.0001634 |
-0.4311 |
| Informational_Duration |
0.000219 |
0.0001822 |
1.202 |
| ProductRelated_Duration |
0.0001013 |
1.465e-05 |
6.914 |
| BounceRates |
-3.808 |
3.252 |
-1.171 |
| ExitRates |
-16.2 |
2.347 |
-6.905 |
| PageValues |
0.08223 |
0.002402 |
34.23 |
| MonthDec |
-0.6181 |
0.1818 |
-3.401 |
| MonthFeb |
-1.81 |
0.6354 |
-2.848 |
| MonthJul |
0.07887 |
0.2183 |
0.3613 |
| MonthJune |
-0.3071 |
0.2751 |
-1.116 |
| MonthMar |
-0.5278 |
0.1794 |
-2.942 |
| MonthMay |
-0.5877 |
0.1692 |
-3.474 |
| MonthNov |
0.5368 |
0.1622 |
3.309 |
| MonthOct |
-0.01127 |
0.2016 |
-0.05589 |
| MonthSep |
-0.01799 |
0.2123 |
-0.0847 |
| VisitorTypeOther |
-0.5364 |
0.5245 |
-1.023 |
| VisitorTypeReturning_Visitor |
-0.3129 |
0.08512 |
-3.676 |
| WeekendTRUE |
0.09889 |
0.07086 |
1.396 |
| (Intercept) |
1.614e-21 |
| Administrative_Duration |
0.6664 |
| Informational_Duration |
0.2293 |
| ProductRelated_Duration |
4.726e-12 |
| BounceRates |
0.2416 |
| ExitRates |
5.011e-12 |
| PageValues |
8.675e-257 |
| MonthDec |
0.0006725 |
| MonthFeb |
0.004398 |
| MonthJul |
0.7179 |
| MonthJune |
0.2643 |
| MonthMar |
0.003256 |
| MonthMay |
0.0005134 |
| MonthNov |
0.000935 |
| MonthOct |
0.9554 |
| MonthSep |
0.9325 |
| VisitorTypeOther |
0.3064 |
| VisitorTypeReturning_Visitor |
0.0002369 |
| WeekendTRUE |
0.1628 |
# Collinearity check: (generalized) variance inflation factors for the full model.
vif(full.model)
## GVIF Df GVIF^(1/(2*Df))
## Administrative_Duration 1.158832 1 1.076491
## Informational_Duration 1.144643 1 1.069880
## ProductRelated_Duration 1.328126 1 1.152444
## BounceRates 1.984528 1 1.408733
## ExitRates 2.102475 1 1.449991
## PageValues 1.061579 1 1.030330
## Month 1.107538 9 1.005691
## VisitorType 1.128430 2 1.030668
## Weekend 1.011496 1 1.005732
# Reduced model: logistic regression (logit link) of Revenue on the two rate
# variables, PageValues, and VisitorType only.
reduced.model <- glm(
  Revenue ~ BounceRates + ExitRates + PageValues + VisitorType,
  data = dat2, family = binomial(link = "logit")
)

# coef() on the summary returns the complete coefficient table without
# relying on partial `$` matching.
pander(coef(summary(reduced.model)),
  caption = "Reduced Model — Inferential Statistics")
Reduced Model — Inferential Statistics
| (Intercept) |
-1.593 |
0.08097 |
-19.67 |
3.976e-86 |
| BounceRates |
-0.1066 |
3.106 |
-0.03431 |
0.9726 |
| ExitRates |
-21.26 |
2.256 |
-9.422 |
4.415e-21 |
| PageValues |
0.07907 |
0.002317 |
34.13 |
2.792e-255 |
| VisitorTypeOther |
-0.6119 |
0.5099 |
-1.2 |
0.2301 |
| VisitorTypeReturning_Visitor |
-0.1581 |
0.08121 |
-1.947 |
0.05154 |
# Final model: forward selection by AIC, starting from the reduced model.
# The reduced model's terms are always retained (lower scope); candidate
# terms are those of the full model (upper scope). trace = 0 silences the
# step-by-step log.
final.model <- stepAIC(
  reduced.model,
  scope = list(lower = formula(reduced.model), upper = formula(full.model)),
  direction = "forward",
  trace = 0
)

# coef() on the summary returns the complete coefficient table without
# relying on partial `$` matching.
pander(coef(summary(final.model)),
  caption = "Final Model (Stepwise Forward) — Inferential Statistics")
Final Model (Stepwise Forward) — Inferential
Statistics
| (Intercept) |
-1.666 |
0.1729 |
-9.636 |
5.66e-22 |
| BounceRates |
-3.788 |
3.245 |
-1.168 |
0.243 |
| ExitRates |
-16.17 |
2.336 |
-6.921 |
4.5e-12 |
| PageValues |
0.08227 |
0.002399 |
34.29 |
1.14e-257 |
| VisitorTypeOther |
-0.5388 |
0.5252 |
-1.026 |
0.3049 |
| VisitorTypeReturning_Visitor |
-0.3099 |
0.08507 |
-3.643 |
0.0002692 |
| MonthDec |
-0.6126 |
0.1817 |
-3.372 |
0.0007462 |
| MonthFeb |
-1.812 |
0.6357 |
-2.85 |
0.004373 |
| MonthJul |
0.08542 |
0.2182 |
0.3915 |
0.6954 |
| MonthJune |
-0.309 |
0.2752 |
-1.123 |
0.2614 |
| MonthMar |
-0.5232 |
0.1793 |
-2.917 |
0.003533 |
| MonthMay |
-0.5849 |
0.1692 |
-3.458 |
0.0005451 |
| MonthNov |
0.5389 |
0.1621 |
3.324 |
0.0008889 |
| MonthOct |
-0.009271 |
0.2015 |
-0.04602 |
0.9633 |
| MonthSep |
-0.01598 |
0.2123 |
-0.07526 |
0.94 |
| ProductRelated_Duration |
0.0001047 |
1.342e-05 |
7.796 |
6.395e-15 |
| WeekendTRUE |
0.1017 |
0.0708 |
1.437 |
0.1508 |
# Collect global fit statistics of a fitted glm into a one-row matrix.
#
# m: a fitted model exposing $deviance, $null.deviance and $aic, and
#    supporting df.residual() (e.g. the result of glm()).
# Returns a 1 x 4 numeric matrix with columns Deviance, Null.Deviance, AIC
# and DF (residual degrees of freedom); the matrix has no row names.
global.measure <- function(m) {
  fit_stats <- c(
    Deviance = m$deviance,
    Null.Deviance = m$null.deviance,
    AIC = m$aic,
    DF = df.residual(m)
  )
  matrix(fit_stats, nrow = 1, dimnames = list(NULL, names(fit_stats)))
}
# Stack the one-row fit summaries of the three candidate models.
goodness <- rbind(
  global.measure(full.model),
  global.measure(reduced.model),
  global.measure(final.model)
)
# rbind() ignores argument names when its inputs are matrices, so the model
# labels must be attached explicitly for the table rows to be identifiable.
rownames(goodness) <- c("Full", "Reduced", "Final")

pander(goodness, caption = "Global Goodness-of-Fit: Deviance & AIC")
Global Goodness-of-Fit: Deviance & AIC
| 7167 |
10625 |
7205 |
12311 |
| 7551 |
10625 |
7563 |
12324 |
| 7169 |
10625 |
7203 |
12313 |
# Coefficient table of the final model (estimate, standard error, z, p-value).
coef_tab <- coef(summary(final.model))

# Odds ratios are exponentiated from the table's own Estimate column, so each
# ratio is guaranteed to sit on the row of the coefficient it belongs to (the
# summary table omits aliased coefficients, whereas coef(model) keeps them).
odds <- exp(coef_tab[, "Estimate"])

out <- cbind(coef_tab, `Odds Ratio` = odds)
pander(out, digits = 4, caption = "Final Model Coefficients with Odds Ratios")
Final Model Coefficients with Odds Ratios (continued
below)
| (Intercept) |
-1.666 |
0.1729 |
-9.636 |
| BounceRates |
-3.788 |
3.245 |
-1.168 |
| ExitRates |
-16.17 |
2.336 |
-6.921 |
| PageValues |
0.08227 |
0.002399 |
34.29 |
| VisitorTypeOther |
-0.5388 |
0.5252 |
-1.026 |
| VisitorTypeReturning_Visitor |
-0.3099 |
0.08507 |
-3.643 |
| MonthDec |
-0.6126 |
0.1817 |
-3.372 |
| MonthFeb |
-1.812 |
0.6357 |
-2.85 |
| MonthJul |
0.08542 |
0.2182 |
0.3915 |
| MonthJune |
-0.309 |
0.2752 |
-1.123 |
| MonthMar |
-0.5232 |
0.1793 |
-2.917 |
| MonthMay |
-0.5849 |
0.1692 |
-3.458 |
| MonthNov |
0.5389 |
0.1621 |
3.324 |
| MonthOct |
-0.009271 |
0.2015 |
-0.04602 |
| MonthSep |
-0.01598 |
0.2123 |
-0.07526 |
| ProductRelated_Duration |
0.0001047 |
1.342e-05 |
7.796 |
| WeekendTRUE |
0.1017 |
0.0708 |
1.437 |
| (Intercept) |
5.66e-22 |
0.189 |
| BounceRates |
0.243 |
0.02264 |
| ExitRates |
4.5e-12 |
9.504e-08 |
| PageValues |
1.14e-257 |
1.086 |
| VisitorTypeOther |
0.3049 |
0.5834 |
| VisitorTypeReturning_Visitor |
0.0002692 |
0.7335 |
| MonthDec |
0.0007462 |
0.5419 |
| MonthFeb |
0.004373 |
0.1634 |
| MonthJul |
0.6954 |
1.089 |
| MonthJune |
0.2614 |
0.7342 |
| MonthMar |
0.003533 |
0.5926 |
| MonthMay |
0.0005451 |
0.5572 |
| MonthNov |
0.0008889 |
1.714 |
| MonthOct |
0.9633 |
0.9908 |
| MonthSep |
0.94 |
0.9841 |
| ProductRelated_Duration |
6.395e-15 |
1 |
| WeekendTRUE |
0.1508 |
1.107 |
# In-sample fitted probabilities from the final model.
phat <- fitted(final.model)

# Build the ROC object with the class order and comparison direction stated
# explicitly ("No" = control, "Yes" = case, controls score lower than cases)
# instead of letting pROC auto-detect them from the data.
roc_obj <- roc(
  response = dat2$Revenue,
  predictor = phat,
  levels = c("No", "Yes"),
  direction = "<"
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Draw the ROC curve stored in roc_obj.
plot(roc_obj, main = "ROC Curve — Final Logistic Model")

# Area under the ROC curve; the surrounding parentheses both store and print it.
(auc_val <- auc(roc_obj))
## Area under the curve: 0.8969
# Hosmer-Lemeshow goodness-of-fit test with 10 groups, comparing the observed
# 0/1 response (1 = "Yes") against the fitted probabilities in phat.
y_num <- as.numeric(dat2$Revenue == "Yes")
hoslem.test(x = y_num, y = phat, g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: y_num, phat
## X-squared = 180.45, df = 8, p-value < 2.2e-16