library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
# Read the cleaned event log as a data.table so later grouped operations are
# fast.
DT <- fread(file = "clean_dat.csv")
# Restore the objects stored in the .RData file into the current environment.
# NOTE(review): absolute, machine-specific path -- confirm before running on
# another machine.
load(file = "/Users/isaiahmireles/Downloads/sample_summary_stats.RData")

Successful_Journey Feature

# Sort events chronologically within each id so that row .N of every group is
# that id's latest event.
setorderv(DT, c("id", "event_timestamp"))

# Successful_Journey is 1 on every row of an id whose final event is
# "order_shipped", and 0 otherwise.
DT[, Successful_Journey := {
  final_event <- event_name[.N]
  as.integer(final_event == "order_shipped")
}, by = id]
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Tabulate Successful_Journey and add each level's share of the total.
# count() tallies rows of DT, and every event row carries its id's flag, so
# these are shares of event rows rather than of distinct ids.
mutate(
  count(DT, Successful_Journey),
  percent = n / sum(n)
)
##    Successful_Journey        n   percent
##                 <int>    <int>     <num>
## 1:                  0 36680999 0.7074601
## 2:                  1 15167862 0.2925399

Unsuccessful/Ongoing Label

# Keep each id's events in chronological order so row .N is its latest event.
setorderv(DT, c("id", "event_timestamp"))

# Latest timestamp observed anywhere in the dataset.
dataset_end <- max(DT$event_timestamp)

# ongoing = 1 when the journey is not (yet) successful and the id's latest
# event falls within 60 days of the end of the dataset; 0 otherwise.
DT[, ongoing := {
  days_since_last_event <- as.numeric(
    dataset_end - event_timestamp[.N],
    units = "days"
  )
  as.integer(Successful_Journey == 0 & days_since_last_event < 60)
}, by = id]
# Who triggers each event, keyed by event name. Looking labels up by name
# (rather than by position in unique(DT$event_name)) keeps the classification
# correct even if the row order or the set of events in DT changes.
initiated_by_lookup <- c(
  "application_web_approved"               = "website_initiated",
  "browse_products"                        = "user_initiated",
  "application_web_view"                   = "user_initiated",
  "campaign_click"                         = "user_initiated",
  "add_to_cart"                            = "user_initiated",
  "view_cart"                              = "user_initiated",
  "begin_checkout"                         = "user_initiated",
  "promotion_created"                      = "website_initiated",
  "catalog_(mail)"                         = "website_initiated",
  "application_web_submit"                 = "user_initiated",
  "account_activitation"                   = "website_initiated",
  "place_order_web"                        = "user_initiated",
  "campaignemail_clicked"                  = "user_initiated",
  "application_phone_approved"             = "website_initiated",
  "pre-application_(3rd_party_affiliates)" = "user_initiated",
  "place_downpayment"                      = "user_initiated",
  "account_downpaymentcleared"             = "website_initiated",
  "order_shipped"                          = "website_initiated",
  "place_order_phone"                      = "user_initiated",
  "application_web_declined"               = "website_initiated",
  "site_registration"                      = "user_initiated",
  "account_downpaymentreceived"            = "website_initiated",
  "catalog_(email)_(experian)"             = "website_initiated",
  "application_phone_declined"             = "website_initiated",
  "fingerhut_university"                   = "user_initiated",
  "application_web_pending"                = "website_initiated",
  "customer_requested_catalog_(digital)"   = "user_initiated"
)

# Distinct event names, in order of first appearance in DT.
events <- unique(DT$event_name)

# Fail loudly instead of silently mislabelling when DT contains an event name
# (or a missing name) that has no classification yet.
unmapped_events <- events[!events %in% names(initiated_by_lookup)]
if (length(unmapped_events) > 0) {
  stop(
    "No initiated_by classification for event(s): ",
    paste(unmapped_events, collapse = ", "),
    call. = FALSE
  )
}

# One row per observed event with its initiator label.
event_classification <- data.frame(
  event_name = events,
  initiated_by = unname(initiated_by_lookup[as.character(events)]),
  stringsAsFactors = FALSE
)

event_classification
##                                event_name      initiated_by
## 1                application_web_approved website_initiated
## 2                         browse_products    user_initiated
## 3                    application_web_view    user_initiated
## 4                          campaign_click    user_initiated
## 5                             add_to_cart    user_initiated
## 6                               view_cart    user_initiated
## 7                          begin_checkout    user_initiated
## 8                       promotion_created website_initiated
## 9                          catalog_(mail) website_initiated
## 10                 application_web_submit    user_initiated
## 11                   account_activitation website_initiated
## 12                        place_order_web    user_initiated
## 13                  campaignemail_clicked    user_initiated
## 14             application_phone_approved website_initiated
## 15 pre-application_(3rd_party_affiliates)    user_initiated
## 16                      place_downpayment    user_initiated
## 17             account_downpaymentcleared website_initiated
## 18                          order_shipped website_initiated
## 19                      place_order_phone    user_initiated
## 20               application_web_declined website_initiated
## 21                      site_registration    user_initiated
## 22            account_downpaymentreceived website_initiated
## 23             catalog_(email)_(experian) website_initiated
## 24             application_phone_declined website_initiated
## 25                   fingerhut_university    user_initiated
## 26                application_web_pending website_initiated
## 27   customer_requested_catalog_(digital)    user_initiated
# Attach a 0/1 user_initiated flag to every event row by joining on
# event_name; all.x = TRUE keeps every row of DT even without a match.
DT <- merge(
  DT,
  data.frame(
    event_name = event_classification$event_name,
    user_initiated = as.integer(
      event_classification$initiated_by == "user_initiated"
    ),
    stringsAsFactors = FALSE
  ),
  by = "event_name",
  all.x = TRUE
)

# Put events back in chronological order within each id after the merge.
setorderv(DT, c("id", "event_timestamp"))

# Per-id summaries of who initiated the events:
#   n_user_initiated / n_website_initiated: event counts by initiator
#   prop_user_initiated: share of the id's events that are user-initiated
#   last_user_initiated: flag of the id's latest event
#   n_alternations_initiated: number of consecutive event pairs whose
#     initiator differs
DT[, c(
  "n_user_initiated",
  "n_website_initiated",
  "prop_user_initiated",
  "last_user_initiated",
  "n_alternations_initiated"
) := {
  is_user <- user_initiated == 1
  list(
    sum(is_user),
    sum(user_initiated == 0),
    mean(is_user),
    user_initiated[.N],
    sum(user_initiated[-1] != user_initiated[-.N])
  )
}, by = id]

Journey size and speed

total_duration_mins

# Minutes elapsed between each id's earliest and latest event (0 when the id
# has a single event or all its events share one timestamp).
DT[, total_duration_mins := {
  journey_span <- max(event_timestamp) - min(event_timestamp)
  as.numeric(journey_span, units = "mins")
}, by = id]
  • Here we measure the time elapsed, in minutes, between each user's first and last recorded event.
# Sanity-check the spread of journey durations (minutes). Use the full column
# name: `DT$total_duration` only resolves through `$` partial matching and
# would silently break if another column starting with "total_duration" were
# added.
range(DT$total_duration_mins)
## [1]      0.0 632034.2
# Distribution of journey duration in minutes. The column is referenced by
# its full name instead of relying on `$` partial matching of
# "total_duration".
hist(
  DT$total_duration_mins,
  breaks = 60,
  main = "Histogram of Total Journey Duration",
  xlab = "Total Duration (min)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)

  • We notice a bimodal distribution: some users spend almost no time, while others spend close to 7,500 minutes.

avg_time_between_events

# Mean gap, in minutes, between consecutive events of each id. An id with a
# single event has no gaps, so its value is NaN (mean of an empty vector).
DT[, avg_time_between_events := {
  gaps_mins <- as.numeric(diff(event_timestamp), units = "mins")
  mean(gaps_mins)
}, by = id]
  • Here we consider the average time, in minutes, between consecutive events for each user.
# Distribution of the per-id average gap between events on a log(1 + x)
# scale.
hist(
  x = log1p(DT[["avg_time_between_events"]]),
  breaks = 60,
  col = "lightblue",
  border = "white",
  main = "Histogram of Log Average Time Between Events",
  xlab = "log(1 + Average Time Between Events in min)",
  ylab = "Frequency"
)

  • We apply a log(1 + x) transform to reduce the strong right skew of the data.
# Event rate per id: number of events divided by the first-to-last span.
# The span is floored at one minute (1/60 hour, 1/(24*60) day) so that
# zero-length journeys do not divide by zero.
DT[, c("events_per_hour", "events_per_day") := {
  journey_span <- max(event_timestamp) - min(event_timestamp)
  span_hours <- as.numeric(journey_span, units = "hours")
  span_days <- as.numeric(journey_span, units = "days")
  list(
    .N / max(span_hours, 1 / 60),
    .N / max(span_days, 1 / (24 * 60))
  )
}, by = id]
# Distribution of events per day on a log(1 + x) scale. The title and axis
# label state the transform so the plot is not read as raw rates.
hist(
  log1p(DT$events_per_day),
  breaks = 60,
  main = "Histogram of Log Events Per Day",
  xlab = "log(1 + Events Per Day)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)

# Distribution of events per hour on a log(1 + x) scale. The title and axis
# label state the transform so the plot is not read as raw rates.
hist(
  log1p(DT$events_per_hour),
  breaks = 60,
  main = "Histogram of Log Events Per Hour",
  xlab = "log(1 + Events Per Hour)",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)

Funnel progression

# Funnel flags: 1 on every row of an id that has at least one event of the
# given type, else 0. `%in%` never returns NA, so a missing event_name on
# some row cannot turn a flag into NA (any(x == ...) would when no row
# matches).
DT[, `:=`(
  reached_application_web_view     = as.integer("application_web_view" %in% event_name),
  reached_application_web_submit   = as.integer("application_web_submit" %in% event_name),
  reached_application_web_approved = as.integer("application_web_approved" %in% event_name),
  reached_add_to_cart              = as.integer("add_to_cart" %in% event_name),
  reached_view_cart                = as.integer("view_cart" %in% event_name),
  reached_begin_checkout           = as.integer("begin_checkout" %in% event_name),
  # An order placed through either channel counts as reaching this stage.
  reached_place_order              = as.integer(any(c("place_order_web", "place_order_phone") %in% event_name)),
  reached_place_downpayment        = as.integer("place_downpayment" %in% event_name)
), by = id]
# Per-id counts of selected event types, repeated on every row of the id.
# na.rm = TRUE keeps a missing event_name on one row from turning the id's
# whole count into NA.
DT[, `:=`(
  n_browse_products        = sum(event_name == "browse_products", na.rm = TRUE),
  n_application_web_view   = sum(event_name == "application_web_view", na.rm = TRUE),
  n_add_to_cart            = sum(event_name == "add_to_cart", na.rm = TRUE),
  n_view_cart              = sum(event_name == "view_cart", na.rm = TRUE),
  n_begin_checkout         = sum(event_name == "begin_checkout", na.rm = TRUE),
  n_campaign_click         = sum(event_name == "campaign_click", na.rm = TRUE),
  n_campaignemail_clicked  = sum(event_name == "campaignemail_clicked", na.rm = TRUE)
), by = id]

Reading in Data

Modeling :

EDA

# Number of days between 2020-11-03 and 2023-01-23.
as.numeric(
  difftime(as.Date("2023-01-23"), as.Date("2020-11-03"), units = "days")
)
## [1] 811