library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw export (date, channel grouping, sessions, conversion rate)
dive_df <- read.csv(file = "date-channel-sessions-cr.csv")
My project data set did not have good time series data; it only had a month column. That column is useful for my predictive models, but it leaves out days and years and covers only 10 months, so it could not be used for the time series analysis in this exercise. Here is the distribution of the month variable from my project data, for comparison.
So instead I found a similar data set with a proper ‘date’ feature that also has a target variable related to conversion rate, like mine. Many e-commerce data sets focus on sales amounts rather than considering the visitors who came to the site but did not make a purchase. For exploring this aspect of e-commerce data, this data set was the best I could find.
It includes the date, the default channel grouping (direct, organic search, referral, etc.), sessions (some form of number of pages or duration on the site), and the e-commerce conversion rate.
# Quick per-column overview of the freshly loaded data frame
summary(object = dive_df)
## Date Default.Channel.Grouping Sessions
## Min. :20170924 Length:4954 Length:4954
## 1st Qu.:20180516 Class :character Class :character
## Median :20181214 Mode :character Mode :character
## Mean :20185316
## 3rd Qu.:20190809
## Max. :20200330
## Ecommerce.Conversion.Rate
## Length:4954
## Class :character
## Mode :character
##
##
##
First, I convert the ‘date’ feature to a date type in R.
# Parse the numeric yyyymmdd stamps into real Date values
dive_df$Date <- dive_df$Date |>
  as.character() |>
  as.Date(format = "%Y%m%d")
# Inspect the result to confirm the column is now a Date vector
str(dive_df$Date)
## Date[1:4954], format: "2020-03-08" "2020-03-08" "2020-03-08" "2020-03-08" "2019-11-12" ...
# Turn percentage strings such as "6.32%" into numeric proportions (0.0632):
# drop the literal percent sign, coerce to numeric, then scale by 1/100
dive_df$Ecommerce.Conversion.Rate <- (
  dive_df$Ecommerce.Conversion.Rate |>
    gsub(pattern = "%", replacement = "", fixed = TRUE) |>
    as.numeric()
) / 100
# Confirm the column is now numeric
str(dive_df$Ecommerce.Conversion.Rate)
## num [1:4954] 0.0632 0.059 0.0561 0.0518 0.0476 0.0454 0.0452 0.0412 0.0408 0.0389 ...
Next, I’ll plot the number of instances per date and include a trendline.
# Tally how many rows exist for each calendar date
date_counts <- summarise(group_by(dive_df, Date), count = n())

# Raw daily counts in black, with a blue LOESS trend (span 0.2) drawn on top
ggplot(date_counts, aes(Date, count)) +
  geom_line(colour = "black", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.2, se = FALSE, colour = "blue") +
  labs(
    title = "Frequency of Instances Over Time",
    x = "Date",
    y = "Count of Instances"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Next, I will group together all the days that share the same month and day, creating a single theoretical year. This way we can understand seasonal trends better.
library(dplyr)
library(ggplot2)
# Collapse every observed date onto a single "theoretical year" keyed by
# month and day, so the same calendar day from different years is pooled.
# NOTE(review): date_counts$month_day is not read by the code that follows in
# this section; it is kept in case other sections rely on it.
date_counts$month_day <- format(date_counts$Date, "%m-%d")
aggregated_counts <- dive_df |>
  mutate(month_day = format(Date, "%m-%d")) |>
  group_by(month_day) |>
  summarise(
    # Number of rows sharing this month-day
    count = n(),
    # Mean conversion rate across all rows sharing this month-day
    ConversionRate = mean(Ecommerce.Conversion.Rate, na.rm = TRUE)
  )
# Convert 'month_day' back to a Date for plotting. The year is pinned to 2024
# explicitly: as.Date() with a year-less format substitutes the *current*
# year, which (a) stops matching the 2024 month markers used by the plots and
# (b) turns "02-29" into NA whenever the current year is not a leap year.
# 2024 is a leap year, so every month-day parses.
aggregated_counts$month_day <- as.Date(
  paste0("2024-", aggregated_counts$month_day),
  format = "%Y-%m-%d"
)
# Line chart of per-day instance counts across the pooled theoretical year
ggplot(aggregated_counts, aes(month_day, count)) +
  geom_line(colour = "blue") +
  labs(
    title = "Frequency of Instances in a Single Theoretical Year",
    x = "Date (Month-Day)",
    y = "Count of Instances"
  ) +
  theme_minimal()
Now I’ll use a geom_smooth() trendline with a lowered ‘span’ so the line fits the data more tightly. I also added vertical lines to show where each month begins.
# First day of each month in the year carried by the plotted data. These dates
# place the vertical month separators and the quarterly axis breaks. The year
# is read from the data so the markers always line up with the plotted dates.
plot_year <- format(min(aggregated_counts$month_day, na.rm = TRUE), "%Y")
month_starts <- seq(
  as.Date(paste0(plot_year, "-01-01")),
  by = "1 month",
  length.out = 12
)

# Raw per-day counts (faint gray) with a tight LOESS trend (span 0.1) on top.
# Line thickness uses `linewidth`; `size` is deprecated for line geoms since
# ggplot2 3.4.0.
ggplot(data = aggregated_counts, aes(x = month_day, y = count)) +
  geom_line(color = "gray", alpha = 0.5, linewidth = 1) + # Original line
  geom_smooth(
    method = "loess", span = 0.1, se = FALSE,
    color = "darkolivegreen3", linewidth = 1.5
  ) + # Smoothed line
  # Add vertical lines for each month
  geom_vline(
    xintercept = as.numeric(month_starts),
    color = "darkolivegreen2", linewidth = 0.5
  ) +
  # Customize x-axis ticks to show every 3 months (Jan, Apr, Jul, Oct)
  scale_x_date(
    breaks = month_starts[c(1, 4, 7, 10)],
    labels = c("January", "April", "July", "October")
  ) +
  labs(
    title = "Smoothed Frequency of Instances in a Single Theoretical Year",
    x = "Date (Month-Day)",
    y = "Count of Instances"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
Now I’ll look at the conversion rate over the same period.
# Conversion-rate counterpart of the smoothed count plot: raw per-day mean
# conversion rate (faint gray) with a tight LOESS trend (span 0.1) on top.
# Line thickness uses `linewidth`; `size` is deprecated for line geoms since
# ggplot2 3.4.0.
ggplot(data = aggregated_counts, aes(x = month_day, y = ConversionRate)) +
  geom_line(color = "gray", alpha = 0.5, linewidth = 1) + # Original line
  geom_smooth(
    method = "loess", span = 0.1, se = FALSE,
    color = "darkolivegreen3", linewidth = 1.5
  ) + # Smoothed line
  # Add vertical lines for each month
  geom_vline(
    xintercept = as.numeric(month_starts),
    color = "darkolivegreen2", linewidth = 0.5
  ) +
  # Customize x-axis ticks to show every 3 months (Jan, Apr, Jul, Oct); the
  # breaks are taken from month_starts so they always agree with the vertical
  # month lines drawn above
  scale_x_date(
    breaks = month_starts[c(1, 4, 7, 10)],
    labels = c("January", "April", "July", "October")
  ) +
  # Show the proportions on the y-axis as whole-number percentages
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Smoothed Conversion Rate in a Single Theoretical Year",
    x = "Date (Month-Day)",
    y = "Conversion Rate (%)"
  ) +
  theme_minimal() +
  # Mute the y-axis text and drop minor grid lines
  theme(
    axis.text.y = element_text(color = "gray"),
    axis.title.y = element_text(color = "gray"),
    panel.grid.minor = element_blank()
  )
## `geom_smooth()` using formula = 'y ~ x'
# Load required libraries
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Let auto.arima() choose a non-seasonal ARIMA order for the per-day counts
arima_model <- auto.arima(y = aggregated_counts$count, seasonal = FALSE)
# Show the selected order, coefficients and training-set error measures
summary(object = arima_model)
## Series: aggregated_counts$count
## ARIMA(2,1,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.3569 0.1648 -0.8543
## s.e. 0.0819 0.0669 0.0595
##
## sigma^2 = 2.034: log likelihood = -646.21
## AIC=1300.42 AICc=1300.53 BIC=1316.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01871708 1.418361 1.024954 -1.148268 8.412741 1.016599
## ACF1
## Training set 0.008216618
# Residual diagnostics for the fitted model (plots plus a Ljung-Box test)
checkresiduals(object = arima_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,1)
## Q* = 6.7888, df = 7, p-value = 0.4512
##
## Model df: 3. Total lags used: 10
# Forecast the next 12 observations. The fitted series has one point per
# month-day of the theoretical year, so h = 12 means 12 days, not 12 months.
forecasted_values <- forecast(arima_model, h = 12)
# Plot the forecast. The model was fit on a plain numeric vector, so the
# x-axis is the observation index (day position within the theoretical year),
# not a calendar date.
plot(
  forecasted_values,
  main = "ARIMA Model Forecast",
  xlab = "Day of Theoretical Year (index)",
  ylab = "Count of Instances"
)
Dataset Insights:
The dataset confirms the seasonal trends in e-commerce that were noticed from our project data, with clear peaks during high-activity periods (e.g., November).
Conversion rates and user activity are influenced by external factors such as promotions, holidays, and marketing campaigns.
In the distribution from our project data set, the proportion of “True” revenue instances is noticeably higher in November, suggesting high customer conversion in that month. This conversion rate spike is also seen in our new data set around late November and early December.
Methodology:
Aggregating and smoothing data helped identify patterns, especially in creating a single year for better seasonality analysis.
The ARIMA model, while effective for trend analysis, needs adjustments (e.g., SARIMA) to incorporate the seasonal component observed in the data.
Business Applications:
Predicting future activity requires models that consider both trends and seasonal cycles.
Insights from such analysis can guide resource allocation (e.g., inventory, marketing) to capitalize on peak periods.