library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
# Read the cleaned event log with fread() so it arrives as a data.table;
# the operations applied to DT further down are faster that way.
DT <- fread(file = "clean_dat.csv")
# Restore whatever objects are stored in the summary-statistics .RData file.
load(file = "/Users/isaiahmireles/Downloads/sample_summary_stats.RData")
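Orders whose last event is order_shipped are considered successful.
Unsuccessful orders whose last event is less than 60 days before the end of
the dataset are considered ongoing.
Unsuccessful orders at 60 days or more are not considered ongoing.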
# Successful_Journey feature ----
# Order DT by id and, within each id, from the earliest to the latest
# event_timestamp, so that row .N of every group is that id's last event.
setorder(DT, id, event_timestamp)
# For each id, look at the last event: is it order_shipped?
# 1 = the last event is order_shipped, 0 = it is anything else.
DT[, Successful_Journey := as.integer(event_name[.N] == "order_shipped"),
   by = id]
# dplyr supplies count() and mutate() for the summary that follows.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Tabulate the rows of DT by Successful_Journey and add each class's share of
# all rows as `percent`. count() tallies rows of DT, so the shares are over
# event rows rather than over distinct ids.
DT |>
  count(Successful_Journey) |>
  mutate(percent = prop.table(n))
## Successful_Journey n percent
## <int> <int> <num>
## 1: 0 36680999 0.7074601
## 2: 1 15167862 0.2925399
About 70% of the event rows belong to unsuccessful or ongoing journeys.
About 30% of the event rows belong to successful journeys.
# Keep every id's events in chronological order so that, inside each group,
# event_timestamp[.N] is that id's most recent event.
setorder(DT, id, event_timestamp)

# Latest timestamp observed anywhere in the dataset.
dataset_end <- DT[, max(event_timestamp)]

# ongoing: 1 = ongoing, 0 = not ongoing.
# An id is ongoing when its journey is not successful and its last event lies
# strictly less than 60 days before dataset_end (exactly 60 days is not
# ongoing).
DT[, ongoing := {
  days_since_last <- as.numeric(
    dataset_end - event_timestamp[.N],
    units = "days"
  )
  as.integer(Successful_Journey == 0 & days_since_last < 60)
}, by = id]
# Classify every event type by who triggers it: "user_initiated" or
# "website_initiated".
events <- unique(DT$event_name)

# Labels are keyed by event name instead of by position, so the mapping does
# not depend on the order in which unique() returns the event names (that
# order follows the current row order of DT).
# Event names are spelled exactly as they appear in the data, including
# "account_activitation".
initiated_by_lookup <- c(
  "application_web_approved" = "website_initiated",
  "browse_products" = "user_initiated",
  "application_web_view" = "user_initiated",
  "campaign_click" = "user_initiated",
  "add_to_cart" = "user_initiated",
  "view_cart" = "user_initiated",
  "begin_checkout" = "user_initiated",
  "promotion_created" = "website_initiated",
  "catalog_(mail)" = "website_initiated",
  "application_web_submit" = "user_initiated",
  "account_activitation" = "website_initiated",
  "place_order_web" = "user_initiated",
  "campaignemail_clicked" = "user_initiated",
  "application_phone_approved" = "website_initiated",
  "pre-application_(3rd_party_affiliates)" = "user_initiated",
  "place_downpayment" = "user_initiated",
  "account_downpaymentcleared" = "website_initiated",
  "order_shipped" = "website_initiated",
  "place_order_phone" = "user_initiated",
  "application_web_declined" = "website_initiated",
  "site_registration" = "user_initiated",
  "account_downpaymentreceived" = "website_initiated",
  "catalog_(email)_(experian)" = "website_initiated",
  "application_phone_declined" = "website_initiated",
  "fingerhut_university" = "user_initiated",
  "application_web_pending" = "website_initiated",
  "customer_requested_catalog_(digital)" = "user_initiated"
)

# Fail fast when the data holds an event type that has no label, rather than
# attaching a wrong or missing classification.
unclassified <- events[!events %in% names(initiated_by_lookup)]
if (length(unclassified) > 0) {
  stop(
    "No initiated_by classification for event(s): ",
    paste(unclassified, collapse = ", "),
    call. = FALSE
  )
}

# One row per event type found in DT, in the order unique() returned them.
event_classification <- data.frame(
  event_name = events,
  initiated_by = unname(initiated_by_lookup[events]),
  stringsAsFactors = FALSE
)
event_classification
## event_name initiated_by
## 1 application_web_approved website_initiated
## 2 browse_products user_initiated
## 3 application_web_view user_initiated
## 4 campaign_click user_initiated
## 5 add_to_cart user_initiated
## 6 view_cart user_initiated
## 7 begin_checkout user_initiated
## 8 promotion_created website_initiated
## 9 catalog_(mail) website_initiated
## 10 application_web_submit user_initiated
## 11 account_activitation website_initiated
## 12 place_order_web user_initiated
## 13 campaignemail_clicked user_initiated
## 14 application_phone_approved website_initiated
## 15 pre-application_(3rd_party_affiliates) user_initiated
## 16 place_downpayment user_initiated
## 17 account_downpaymentcleared website_initiated
## 18 order_shipped website_initiated
## 19 place_order_phone user_initiated
## 20 application_web_declined website_initiated
## 21 site_registration user_initiated
## 22 account_downpaymentreceived website_initiated
## 23 catalog_(email)_(experian) website_initiated
## 24 application_phone_declined website_initiated
## 25 fingerhut_university user_initiated
## 26 application_web_pending website_initiated
## 27 customer_requested_catalog_(digital) user_initiated
# website_initiated or user_initiated ----
# Attach a 0/1 user_initiated flag to every event row:
# 1 = user_initiated, 0 = any other initiated_by value (website_initiated).
# all.x = TRUE keeps every row of DT; an event_name that is absent from
# event_classification gets NA.
DT <- merge(
  DT,
  transform(
    event_classification,
    user_initiated = as.integer(initiated_by == "user_initiated")
  )[, c("event_name", "user_initiated")],
  by = "event_name",
  all.x = TRUE
)
# Per-id summaries of who initiated the events. Put each id's rows back in
# chronological order first, so .N is the last event and neighbouring rows
# are consecutive events.
setorder(DT, id, event_timestamp)
DT[, c(
  "n_user_initiated",
  "n_website_initiated",
  "prop_user_initiated",
  "last_user_initiated",
  "n_alternations_initiated"
) := {
  is_user <- user_initiated == 1
  list(
    sum(is_user),                   # events flagged user_initiated
    sum(user_initiated == 0),       # events flagged as not user_initiated
    mean(is_user),                  # share of user_initiated events
    user_initiated[.N],             # flag of the id's last event
    sum(diff(user_initiated) != 0)  # flag changes between consecutive events
  )
}, by = id]
# total_duration_mins ----
# Time between each id's first and last event, in minutes.
DT[, total_duration_mins := as.numeric(
  max(event_timestamp) - min(event_timestamp),
  units = "mins"
), by = id]
# Spell the column name out in full: `$` resolved "total_duration" only
# through partial matching, which returns NULL as soon as a second column
# starting with "total_duration" exists.
range(DT$total_duration_mins)
## [1] 0.0 632034.2
# Distribution of total journey duration in minutes (one value per row of
# DT). The column is referenced by its full name instead of relying on `$`
# partial matching of "total_duration".
hist(
  DT$total_duration_mins,
  breaks = 60,
  main = "Histogram of Total Journey Duration",
  xlab = "Total Duration (min)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
# avg_time_between_events ----
# Mean gap between consecutive events of an id, in minutes. An id with a
# single event has no gaps: diff() is empty and the mean is NaN.
DT[, avg_time_between_events := mean(
  as.numeric(diff(event_timestamp), units = "mins")
), by = id]
# Plotted on a log(1 + x) scale.
hist(
  log1p(DT$avg_time_between_events),
  breaks = 60,
  main = "Histogram of Log Average Time Between Events",
  xlab = "log(1 + Average Time Between Events in min)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
# Event rate per id: number of events divided by the time between the id's
# first and last event. The time span is floored at one minute (1/60 hour,
# 1/(24*60) day) so ids whose events share a timestamp do not divide by zero.
DT[, c("events_per_hour", "events_per_day") := {
  journey_span <- max(event_timestamp) - min(event_timestamp)
  span_hours <- as.numeric(journey_span, units = "hours")
  span_days <- as.numeric(journey_span, units = "days")
  list(
    .N / pmax(span_hours, 1 / 60),
    .N / pmax(span_days, 1 / (24 * 60))
  )
}, by = id]
# Distribution of events_per_day on a log(1 + x) scale. The title and x-axis
# label state the transformation, because the plotted values are log1p() of
# the rate and not the raw rate.
hist(
  log1p(DT$events_per_day),
  breaks = 60,
  main = "Histogram of Log Events Per Day",
  xlab = "log(1 + Events Per Day)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
# Distribution of events_per_hour on a log(1 + x) scale. The title and x-axis
# label state the transformation, because the plotted values are log1p() of
# the rate and not the raw rate.
hist(
  log1p(DT$events_per_hour),
  breaks = 60,
  main = "Histogram of Log Events Per Hour",
  xlab = "log(1 + Events Per Hour)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
# Funnel flags per id: 1 when the id has at least one event of the given
# type, 0 otherwise. reached_place_order accepts either the web or the phone
# order event.
DT[, c(
  "reached_application_web_view",
  "reached_application_web_submit",
  "reached_application_web_approved",
  "reached_add_to_cart",
  "reached_view_cart",
  "reached_begin_checkout",
  "reached_place_order",
  "reached_place_downpayment"
) := list(
  as.integer(any(event_name == "application_web_view")),
  as.integer(any(event_name == "application_web_submit")),
  as.integer(any(event_name == "application_web_approved")),
  as.integer(any(event_name == "add_to_cart")),
  as.integer(any(event_name == "view_cart")),
  as.integer(any(event_name == "begin_checkout")),
  as.integer(any(event_name %in% c("place_order_web", "place_order_phone"))),
  as.integer(any(event_name == "place_downpayment"))
), by = id]
# Per-id counts of how many times each selected event type occurs.
DT[, c(
  "n_browse_products",
  "n_application_web_view",
  "n_add_to_cart",
  "n_view_cart",
  "n_begin_checkout",
  "n_campaign_click",
  "n_campaignemail_clicked"
) := list(
  sum(event_name == "browse_products"),
  sum(event_name == "application_web_view"),
  sum(event_name == "add_to_cart"),
  sum(event_name == "view_cart"),
  sum(event_name == "begin_checkout"),
  sum(event_name == "campaign_click"),
  sum(event_name == "campaignemail_clicked")
), by = id]
# Number of days from 2020-11-03 to 2023-01-23.
as.numeric(
  difftime(as.Date("2023-01-23"), as.Date("2020-11-03"), units = "days")
)
## [1] 811