Read Dat

setwd("/Users/isaiahmireles/Desktop/M148 GP")
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
dt <- fread("dat_train1.csv")

Time Features

dt$date <- as.Date(dt$event_timestamp) #only dt not time -- dt obj
dt$dt_tm <- as.POSIXct(dt$event_timestamp, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")

Rm Duplicates

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dt <-
  dt |> 
  distinct(id, dt_tm, event_name, .keep_all = TRUE)

Order Shipped Logical

dt$order_shipped <- as.integer(dt$event_name == "order_shipped")

Journeys

|—————- older data —————-|—- last 60 days —-|

setDT(dt)

# Define cutoff as the last observed timestamp in the dataset
cutoff_date <- max(dt$event_timestamp, na.rm = TRUE)

# One row per journey
journeys <- dt[
  ,
  .(
    first_activity = min(event_timestamp, na.rm = TRUE),
    last_activity  = max(event_timestamp, na.rm = TRUE),
    n_events       = .N,
    has_order_shipped = any(order_shipped == 1)
  ),
  by = id
]

# Classify journeys
journeys[
  ,
  journey_status := fifelse(
    has_order_shipped,
    "successful",
    fifelse(
      last_activity < cutoff_date - 60 * 24 * 60 * 60,
      "unsuccessful",
      "ongoing"
    )
  )
]

# Optional: binary indicators
journeys[, successful   := as.integer(journey_status == "successful")]
journeys[, unsuccessful := as.integer(journey_status == "unsuccessful")]
journeys[, ongoing      := as.integer(journey_status == "ongoing")]

journeys[, .N, by = journey_status]
##    journey_status      N
##            <char>  <int>
## 1:   unsuccessful 992757
## 2:        ongoing 158325
## 3:     successful 279363
992757/nrow(journeys)
## [1] 0.6940197
158325/nrow(journeys)
## [1] 0.1106823
279363/nrow(journeys)
## [1] 0.195298

Definitions

Repeated Measures – Journey

# Join journey_status back onto the repeated-measures action-level data
dt <- dt |>
  left_join(
    journeys |>
      as_tibble() |>
      select(id, journey_status, successful, unsuccessful, ongoing),
    by = "id"
  )

Time from first to last action

dt <- dt |>
  group_by(id) |>
  mutate(
    first_last_seconds = as.numeric(
      max(dt_tm, na.rm = TRUE) - min(dt_tm, na.rm = TRUE),
      units = "secs"
    )
  ) |>
  ungroup()
dt |>
  distinct(id, first_last_seconds) |>
  summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
##   avg_days
##      <dbl>
## 1     134.
dt |>
  filter(successful==1) |> 
  distinct(id, first_last_seconds) |>
  summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
##   avg_days
##      <dbl>
## 1     48.3
dt |>
  filter(successful==0) |> 
  distinct(id, first_last_seconds) |>
  summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
##   avg_days
##      <dbl>
## 1     154.
dt |>
  filter(ongoing==1) |> 
  distinct(id, first_last_seconds) |>
  summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
##   avg_days
##      <dbl>
## 1     57.9
dt |>
  filter(unsuccessful==1) |> 
  distinct(id, first_last_seconds) |>
  summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
##   avg_days
##      <dbl>
## 1     170.

TLDR

Time Between Actions

dt <- dt |>
  arrange(id, dt_tm) |>
  group_by(id) |>
  mutate(
    seconds_between_actions = as.numeric(
      dt_tm - lag(dt_tm),
      units = "secs"
    )
  ) |>
  ungroup()
dt |> group_by(journey_status) |> summarise(day_mu=mean(seconds_between_actions, na.rm=T)/(24*60^2), sec_md=median(seconds_between_actions, na.rm=T), sec_dy = sd(seconds_between_actions, na.rm=T)/((24*60^2)))
## # A tibble: 3 × 4
##   journey_status day_mu sec_md sec_dy
##   <chr>           <dbl>  <dbl>  <dbl>
## 1 ongoing         1.69      45  12.1 
## 2 successful      0.900     72   5.92
## 3 unsuccessful    5.62     119  17.7

TLDR

Paste Stage

ev_def <- read.csv("Event Definitions.csv")

ev_def |> distinct(event_name, stage)
##                              event_name            stage
## 1            application_phone_approved Apply for Credit
## 2            application_phone_declined Apply for Credit
## 3             application_phone_pending Apply for Credit
## 4              application_web_approved Apply for Credit
## 5              application_web_declined Apply for Credit
## 6               application_web_pending Apply for Credit
## 7                application_web_submit Apply for Credit
## 8                  application_web_view Apply for Credit
## 9                  account_activitation   Credit Account
## 10                 account_fraud_review   Credit Account
## 11                       campaign_click         Discover
## 12   customer_requested_catalog_digital         Discover
## 13                 fingerhut_university         Discover
## 14 pre_application_3rd_party_affiliates         Discover
## 15                    site_registration         Discover
## 16           account_downpaymentcleared      Downpayment
## 17          account_downpaymentreceived      Downpayment
## 18                    place_downpayment      Downpayment
## 19              place_downpayment_phone      Downpayment
## 20                          add_to_cart   First Purchase
## 21                       begin_checkout   First Purchase
## 22                      browse_products   First Purchase
## 23                    place_order_phone   First Purchase
## 24                      place_order_web   First Purchase
## 25                            view_cart   First Purchase
## 26                        order_shipped    Order Shipped
## 27               catalog_email_experian      Prospecting
## 28                         catalog_mail      Prospecting
dt <- dt |>
  left_join(
    ev_def |>
      distinct(event_name, stage),
    by = "event_name"
  )
dt |> distinct(event_name, stage)
## # A tibble: 27 × 2
##    event_name               stage           
##    <chr>                    <chr>           
##  1 application_web_approved Apply for Credit
##  2 browse_products          First Purchase  
##  3 application_web_view     Apply for Credit
##  4 campaign_click           Discover        
##  5 add_to_cart              First Purchase  
##  6 view_cart                First Purchase  
##  7 begin_checkout           First Purchase  
##  8 promotion_created        <NA>            
##  9 catalog_(mail)           <NA>            
## 10 application_web_submit   Apply for Credit
## # ℹ 17 more rows

Frequency of Stage – by journey_status

dt |>
  distinct(id, event_name, stage, journey_status) |>
  count(journey_status, stage) |>
  group_by(journey_status) |>
  mutate(prop = n / sum(n)) |>
  arrange(journey_status, desc(prop))
## # A tibble: 19 × 4
## # Groups:   journey_status [3]
##    journey_status stage                  n    prop
##    <chr>          <chr>              <int>   <dbl>
##  1 ongoing        First Purchase    392245 0.399  
##  2 ongoing        Apply for Credit  340132 0.346  
##  3 ongoing        <NA>              138798 0.141  
##  4 ongoing        Discover           93925 0.0956 
##  5 ongoing        Credit Account     11538 0.0117 
##  6 ongoing        Downpayment         5979 0.00608
##  7 successful     First Purchase   1327993 0.379  
##  8 successful     Apply for Credit  534069 0.152  
##  9 successful     Downpayment       533281 0.152  
## 10 successful     <NA>              393694 0.112  
## 11 successful     Order Shipped     279363 0.0796 
## 12 successful     Credit Account    278116 0.0793 
## 13 successful     Discover          161480 0.0460 
## 14 unsuccessful   First Purchase   2122198 0.323  
## 15 unsuccessful   <NA>             1895474 0.289  
## 16 unsuccessful   Apply for Credit 1854774 0.283  
## 17 unsuccessful   Discover          597686 0.0911 
## 18 unsuccessful   Credit Account     66289 0.0101 
## 19 unsuccessful   Downpayment        24692 0.00376

TLDR

What Stage do people end on?

# final events
last_stage <- dt |>
  arrange(id, dt_tm) |>
  group_by(id) |>
  slice_tail(n = 1) |>
  ungroup()

last_stage |>
  count(journey_status, stage) |>
  group_by(journey_status) |>
  mutate(prop = n / sum(n))
## # A tibble: 16 × 4
## # Groups:   journey_status [3]
##    journey_status stage                 n     prop
##    <chr>          <chr>             <int>    <dbl>
##  1 ongoing        Apply for Credit  22865 0.144   
##  2 ongoing        Credit Account     1181 0.00746 
##  3 ongoing        Discover          22287 0.141   
##  4 ongoing        Downpayment        1015 0.00641 
##  5 ongoing        First Purchase    75396 0.476   
##  6 ongoing        <NA>              35581 0.225   
##  7 successful     Credit Account      103 0.000369
##  8 successful     Downpayment          91 0.000326
##  9 successful     Order Shipped    277991 0.995   
## 10 successful     <NA>               1178 0.00422 
## 11 unsuccessful   Apply for Credit  62733 0.0632  
## 12 unsuccessful   Credit Account     8143 0.00820 
## 13 unsuccessful   Discover          38826 0.0391  
## 14 unsuccessful   Downpayment        1765 0.00178 
## 15 unsuccessful   First Purchase   166015 0.167   
## 16 unsuccessful   <NA>             715275 0.720

TLDR

Overall Summary

Overall, the analysis suggests that customer behavior differs substantially across successful, ongoing, and unsuccessful journeys. Successful journeys tend to progress quickly through the purchasing funnel, with shorter overall durations, shorter gaps between actions, and much greater participation in later-stage activities such as Downpayment and Credit Account events before ultimately reaching Order Shipped. In contrast, unsuccessful journeys are characterized by much longer durations, larger inactivity gaps between actions, and a tendency to stall earlier in the funnel, particularly during First Purchase and Apply for Credit stages. Many unsuccessful journeys also terminate on uncategorized (NA) events, suggesting disengagement during promotional or less structured interactions. Ongoing journeys generally fall between these two extremes, with moderate durations and action gaps, and are most commonly concentrated in the First Purchase stage, indicating that many users are still actively browsing or considering a purchase.

Path Analysis

library(dplyr)
library(stringr)

paths <- dt |>
  arrange(id, dt_tm) |>
  group_by(id) |>
  summarise(
    path = str_c(stage, collapse = " -> "),
    journey_status = first(journey_status)
  )

length(unique(paths$path)) # 57092
## [1] 57092

Common Paths by journey_status

top_paths <- paths |>
  count(journey_status, path, sort = TRUE) |>
  group_by(journey_status) |>
  slice_head(n = 8)
top_paths |> select(journey_status, path)
## # A tibble: 24 × 2
## # Groups:   journey_status [3]
##    journey_status path                                                          
##    <chr>          <chr>                                                         
##  1 ongoing        <NA>                                                          
##  2 ongoing        Apply for Credit                                              
##  3 ongoing        Apply for Credit -> Discover                                  
##  4 ongoing        Discover -> Apply for Credit                                  
##  5 ongoing        Apply for Credit -> First Purchase                            
##  6 ongoing        Apply for Credit -> First Purchase -> Discover                
##  7 ongoing        Apply for Credit -> Apply for Credit -> Apply for Credit -> A…
##  8 ongoing        Apply for Credit -> Apply for Credit -> Apply for Credit -> A…
##  9 successful     <NA>                                                          
## 10 successful     Credit Account -> Apply for Credit -> First Purchase -> Downp…
## # ℹ 14 more rows

TLDR

Reached Down Payment Stage

downpayment_summary <- dt |>
  group_by(id, journey_status) |>
  summarise(
    reached_downpayment = as.integer(any(stage == "Downpayment", na.rm = TRUE)),
    .groups = "drop"
  )
downpayment_summary |>
  group_by(journey_status) |>
  count(reached_downpayment) |>
  mutate(
    pct = round(n / sum(n), 3)
  )
## # A tibble: 6 × 4
## # Groups:   journey_status [3]
##   journey_status reached_downpayment      n   pct
##   <chr>                        <int>  <int> <dbl>
## 1 ongoing                          0 154448 0.976
## 2 ongoing                          1   3877 0.024
## 3 successful                       0   2065 0.007
## 4 successful                       1 277298 0.993
## 5 unsuccessful                     0 975739 0.983
## 6 unsuccessful                     1  17018 0.017

TLDR

Downpayment stage appears to strongly separate successful journeys from non-successful journeys and is likely a highly predictive feature for future models.