library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the online shoppers dataset from a CSV located in the working directory
project_data <- read.csv(file = "online_shoppers_intention.csv")
First, create 5 random samples, each containing 50% of the rows in our data:
# Seed the RNG once; the five draws below then form a reproducible sequence
set.seed(123)
# Each sample keeps half of the rows of project_data
df1 <- sample_frac(project_data, size = 0.5)
df2 <- sample_frac(project_data, size = 0.5)
df3 <- sample_frac(project_data, size = 0.5)
df4 <- sample_frac(project_data, size = 0.5)
df5 <- sample_frac(project_data, size = 0.5)
# Confirm every sample has the expected size; dim() reports rows, then columns
dim(df1)
## [1] 6165 18
dim(df2)
## [1] 6165 18
dim(df3)
## [1] 6165 18
dim(df4)
## [1] 6165 18
dim(df5)
## [1] 6165 18
Now let’s get a summary of the 5 sets:
# Summarise one sample by purchase outcome and visitor type.
#
# Rows whose VisitorType is "Other" are excluded first. For every remaining
# Revenue x VisitorType combination the result holds the row count plus the
# mean and standard deviation of ExitRates and PageValues (NAs ignored).
# Returns an ungrouped table with one row per combination.
summarize_subset <- function(df) {
  kept_rows <- filter(df, VisitorType != "Other")
  by_outcome_and_type <- group_by(kept_rows, Revenue, VisitorType)
  summarize(
    by_outcome_and_type,
    count = n(),
    avg_exit_rate = mean(ExitRates, na.rm = TRUE),
    sd_exit_rate = sd(ExitRates, na.rm = TRUE),
    avg_page_value = mean(PageValues, na.rm = TRUE),
    sd_page_value = sd(PageValues, na.rm = TRUE),
    .groups = "drop" # hand back a plain, ungrouped table
  )
}
# Build one summary table per sample
df1_summary <- df1 |> summarize_subset()
df2_summary <- df2 |> summarize_subset()
df3_summary <- df3 |> summarize_subset()
df4_summary <- df4 |> summarize_subset()
df5_summary <- df5 |> summarize_subset()
# View summaries for comparison. The printed tibbles are width-truncated:
# sd_page_value is computed but not shown in the output below.
df1_summary
## # A tibble: 4 × 7
## Revenue VisitorType count avg_exit_rate sd_exit_rate avg_page_value
## <lgl> <chr> <int> <dbl> <dbl> <dbl>
## 1 FALSE New_Visitor 635 0.0236 0.0328 1.82
## 2 FALSE Returning_Visitor 4540 0.0510 0.0520 1.97
## 3 TRUE New_Visitor 213 0.0131 0.0114 37.5
## 4 TRUE Returning_Visitor 735 0.0207 0.0166 24.7
## # ℹ 1 more variable: sd_page_value <dbl>
# Summary of the second sample (df2)
df2_summary
## # A tibble: 4 × 7
## Revenue VisitorType count avg_exit_rate sd_exit_rate avg_page_value
## <lgl> <chr> <int> <dbl> <dbl> <dbl>
## 1 FALSE New_Visitor 625 0.0233 0.0336 1.28
## 2 FALSE Returning_Visitor 4545 0.0498 0.0516 2.23
## 3 TRUE New_Visitor 194 0.0142 0.0122 38.0
## 4 TRUE Returning_Visitor 752 0.0207 0.0161 23.3
## # ℹ 1 more variable: sd_page_value <dbl>
# Summary of the third sample (df3)
df3_summary
## # A tibble: 4 × 7
## Revenue VisitorType count avg_exit_rate sd_exit_rate avg_page_value
## <lgl> <chr> <int> <dbl> <dbl> <dbl>
## 1 FALSE New_Visitor 625 0.0232 0.0342 1.18
## 2 FALSE Returning_Visitor 4545 0.0504 0.0522 2.04
## 3 TRUE New_Visitor 200 0.0136 0.0115 35.2
## 4 TRUE Returning_Visitor 748 0.0212 0.0180 23.1
## # ℹ 1 more variable: sd_page_value <dbl>
# Summary of the fourth sample (df4)
df4_summary
## # A tibble: 4 × 7
## Revenue VisitorType count avg_exit_rate sd_exit_rate avg_page_value
## <lgl> <chr> <int> <dbl> <dbl> <dbl>
## 1 FALSE New_Visitor 613 0.0232 0.0335 1.11
## 2 FALSE Returning_Visitor 4567 0.0512 0.0524 2.13
## 3 TRUE New_Visitor 221 0.0144 0.0119 42.6
## 4 TRUE Returning_Visitor 718 0.0207 0.0152 24.0
## # ℹ 1 more variable: sd_page_value <dbl>
# Summary of the fifth sample (df5)
df5_summary
## # A tibble: 4 × 7
## Revenue VisitorType count avg_exit_rate sd_exit_rate avg_page_value
## <lgl> <chr> <int> <dbl> <dbl> <dbl>
## 1 FALSE New_Visitor 622 0.0239 0.0355 1.18
## 2 FALSE Returning_Visitor 4565 0.0511 0.0531 2.06
## 3 TRUE New_Visitor 208 0.0137 0.0113 40.2
## 4 TRUE Returning_Visitor 721 0.0210 0.0153 23.0
## # ℹ 1 more variable: sd_page_value <dbl>
False (No Purchase)
Counts for New_Visitor range from 613 to 635 across the five subsets.
Counts for Returning_Visitor are significantly higher, ranging from 4,540 to 4,567.
True (Purchase)
Counts for New_Visitor range from 194 to 221.
Counts for Returning_Visitor range from 718 to 752.
New Visitors (exit rates):
The average exit rates for New_Visitor hover around 0.013 to 0.014 when a purchase is made, and around 0.023 to 0.024 when not.
Standard deviations for Avg_Exit_Rate are fairly consistent across subsets, indicating a similar level of variance.
Returning Visitors (exit rates):
Average exit rates for Returning_Visitor range from approximately 0.0207 to 0.0212 when a purchase is made, and around 0.050 to 0.051 when not.
The exit rate variability is slightly higher compared to New_Visitor, but still relatively stable across the subsets.
New Visitors (page values):
The average page value for New_Visitor when purchases are made ranges from 35.19 to 42.58 across subsets, indicating that those who do convert see a significantly higher page value.
The standard deviations also show considerable variation, suggesting differences in behavior.
Returning Visitors (page values):
Average page values when purchases are made range from 23.00 to 24.71 and show lower values than New_Visitor but consistent behavior across subsets.
The standard deviations are relatively consistent, indicating uniformity in purchasing behavior among returning visitors.
Anomalies:
Consistency:
Impact of Visitor Type:
Now we’ll visualize the comparison of the 5 subsets.
# Stack the five summaries into one table, tagging every row with the
# sample it came from so the samples can be compared side by side
combined_data <- rbind(
  mutate(df1_summary, subset = "Subset 1"),
  mutate(df2_summary, subset = "Subset 2"),
  mutate(df3_summary, subset = "Subset 3"),
  mutate(df4_summary, subset = "Subset 4"),
  mutate(df5_summary, subset = "Subset 5")
)
# Visualization for Exit Rates
# combined_data holds a single mean per subset x VisitorType x Revenue, so a
# boxplot would be drawn from only two numbers (the purchase and no-purchase
# means) and would blend the two outcomes. Plot each mean as its own bar and
# split the panels by Revenue instead.
ggplot(combined_data, aes(x = subset, y = avg_exit_rate, fill = VisitorType)) +
  geom_col(position = "dodge") +
  facet_wrap(~Revenue, labeller = label_both) +
  labs(
    title = "Average Exit Rate by Subset and Visitor Type",
    x = "Subset",
    y = "Average Exit Rate"
  ) +
  theme_minimal()
# Visualization for Page Values
# combined_data holds a single mean per subset x VisitorType x Revenue, so a
# boxplot would be drawn from only two numbers (the purchase and no-purchase
# means) and would blend the two outcomes. Plot each mean as its own bar and
# split the panels by Revenue instead.
ggplot(combined_data, aes(x = subset, y = avg_page_value, fill = VisitorType)) +
  geom_col(position = "dodge") +
  # Free y scales: the Revenue = TRUE means are roughly an order of magnitude
  # larger than the Revenue = FALSE means and would otherwise flatten that panel
  facet_wrap(~Revenue, labeller = label_both, scales = "free_y") +
  labs(
    title = "Average Page Value by Subset and Visitor Type",
    x = "Subset",
    y = "Average Page Value"
  ) +
  theme_minimal()
# Visualization for counts of each VisitorType
# combined_data has two rows (Revenue TRUE and FALSE) for every
# subset x VisitorType pair. Dodging by fill alone stacks those two bars on the
# same spot so one hides the other; faceting by Revenue gives each its own bar.
ggplot(combined_data, aes(x = subset, y = count, fill = VisitorType)) +
  geom_col(position = "dodge") +
  facet_wrap(~Revenue, labeller = label_both) +
  labs(
    title = "Visitor Counts by Subset and Visitor Type",
    x = "Subset",
    y = "Count"
  ) +
  theme_minimal()