setwd("/Users/isaiahmireles/Desktop/M148 GP")
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
dt <- fread("dat_train1.csv")
dt$date <- as.Date(dt$event_timestamp) #only dt not time -- dt obj
dt$dt_tm <- as.POSIXct(dt$event_timestamp, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dt <-
dt |>
distinct(id, dt_tm, event_name, .keep_all = TRUE)
dt$order_shipped <- as.integer(dt$event_name == "order_shipped")
|—————- older data —————-|—- last 60 days —-|
setDT(dt)
# Define cutoff as the last observed timestamp in the dataset
cutoff_date <- max(dt$event_timestamp, na.rm = TRUE)
# One row per journey
journeys <- dt[
,
.(
first_activity = min(event_timestamp, na.rm = TRUE),
last_activity = max(event_timestamp, na.rm = TRUE),
n_events = .N,
has_order_shipped = any(order_shipped == 1)
),
by = id
]
# Classify journeys
journeys[
,
journey_status := fifelse(
has_order_shipped,
"successful",
fifelse(
last_activity < cutoff_date - 60 * 24 * 60 * 60,
"unsuccessful",
"ongoing"
)
)
]
# Optional: binary indicators
journeys[, successful := as.integer(journey_status == "successful")]
journeys[, unsuccessful := as.integer(journey_status == "unsuccessful")]
journeys[, ongoing := as.integer(journey_status == "ongoing")]
journeys[, .N, by = journey_status]
## journey_status N
## <char> <int>
## 1: unsuccessful 992757
## 2: ongoing 158325
## 3: successful 279363
992757/nrow(journeys)
## [1] 0.6940197
158325/nrow(journeys)
## [1] 0.1106823
279363/nrow(journeys)
## [1] 0.195298
Successful Journey: A journey that eventually
ends with an order_shipped event. Regardless of how long
the journey takes, if an order is shipped, the journey is considered
successful.
Unsuccessful Journey: A journey that does not
end with an order_shipped event and has had no activity for
at least 60 days before the cutoff date.
Ongoing (Incomplete) Journey: A journey that has
not ended with an order_shipped event, but whose most
recent activity occurred within 60 days of the cutoff date, meaning the
final outcome is still unknown.
# Join journey_status back onto the repeated-measures action-level data
dt <- dt |>
left_join(
journeys |>
as_tibble() |>
select(id, journey_status, successful, unsuccessful, ongoing),
by = "id"
)
dt <- dt |>
group_by(id) |>
mutate(
first_last_seconds = as.numeric(
max(dt_tm, na.rm = TRUE) - min(dt_tm, na.rm = TRUE),
units = "secs"
)
) |>
ungroup()
dt |>
distinct(id, first_last_seconds) |>
summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
## avg_days
## <dbl>
## 1 134.
dt |>
filter(successful==1) |>
distinct(id, first_last_seconds) |>
summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
## avg_days
## <dbl>
## 1 48.3
dt |>
filter(successful==0) |>
distinct(id, first_last_seconds) |>
summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
## avg_days
## <dbl>
## 1 154.
dt |>
filter(ongoing==1) |>
distinct(id, first_last_seconds) |>
summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
## avg_days
## <dbl>
## 1 57.9
dt |>
filter(unsuccessful==1) |>
distinct(id, first_last_seconds) |>
summarise(avg_days = mean(first_last_seconds)/(60*60*24))
## # A tibble: 1 × 1
## avg_days
## <dbl>
## 1 170.
Journey duration varies substantially by outcome type.
Successful journeys are much shorter on average (about 48 days), while unsuccessful journeys tend to last much longer (about 170 days). Ongoing journeys fall in between at roughly 58 days, suggesting that time between first and last activity may be an important predictor of eventual purchase success.
dt <- dt |>
arrange(id, dt_tm) |>
group_by(id) |>
mutate(
seconds_between_actions = as.numeric(
dt_tm - lag(dt_tm),
units = "secs"
)
) |>
ungroup()
dt |> group_by(journey_status) |> summarise(day_mu=mean(seconds_between_actions, na.rm=T)/(24*60^2), sec_md=median(seconds_between_actions, na.rm=T), sec_dy = sd(seconds_between_actions, na.rm=T)/((24*60^2)))
## # A tibble: 3 × 4
## journey_status day_mu sec_md sec_dy
## <chr> <dbl> <dbl> <dbl>
## 1 ongoing 1.69 45 12.1
## 2 successful 0.900 72 5.92
## 3 unsuccessful 5.62 119 17.7
Successful journeys have the shortest average time between actions (about 0.9 days), suggesting that users who eventually purchase tend to move through the process relatively quickly.
Ongoing journeys have moderate gaps between actions (about 1.7 days), indicating continued but slower engagement.
Unsuccessful journeys have much longer gaps between actions (about 5.6 days), suggesting that disengaged users interact far less frequently over time.
Median times between actions are much smaller than the means for all groups, indicating that the distributions are heavily right-skewed with some very large inactivity gaps.
Unsuccessful journeys also have the largest variability in time between actions, as shown by the much larger standard deviation.
ev_def <- read.csv("Event Definitions.csv")
ev_def |> distinct(event_name, stage)
## event_name stage
## 1 application_phone_approved Apply for Credit
## 2 application_phone_declined Apply for Credit
## 3 application_phone_pending Apply for Credit
## 4 application_web_approved Apply for Credit
## 5 application_web_declined Apply for Credit
## 6 application_web_pending Apply for Credit
## 7 application_web_submit Apply for Credit
## 8 application_web_view Apply for Credit
## 9 account_activitation Credit Account
## 10 account_fraud_review Credit Account
## 11 campaign_click Discover
## 12 customer_requested_catalog_digital Discover
## 13 fingerhut_university Discover
## 14 pre_application_3rd_party_affiliates Discover
## 15 site_registration Discover
## 16 account_downpaymentcleared Downpayment
## 17 account_downpaymentreceived Downpayment
## 18 place_downpayment Downpayment
## 19 place_downpayment_phone Downpayment
## 20 add_to_cart First Purchase
## 21 begin_checkout First Purchase
## 22 browse_products First Purchase
## 23 place_order_phone First Purchase
## 24 place_order_web First Purchase
## 25 view_cart First Purchase
## 26 order_shipped Order Shipped
## 27 catalog_email_experian Prospecting
## 28 catalog_mail Prospecting
dt <- dt |>
left_join(
ev_def |>
distinct(event_name, stage),
by = "event_name"
)
dt |> distinct(event_name, stage)
## # A tibble: 27 × 2
## event_name stage
## <chr> <chr>
## 1 application_web_approved Apply for Credit
## 2 browse_products First Purchase
## 3 application_web_view Apply for Credit
## 4 campaign_click Discover
## 5 add_to_cart First Purchase
## 6 view_cart First Purchase
## 7 begin_checkout First Purchase
## 8 promotion_created <NA>
## 9 catalog_(mail) <NA>
## 10 application_web_submit Apply for Credit
## # ℹ 17 more rows
Apply for Credit: Events related to customers viewing, submitting, and receiving decisions on credit applications (approved, declined, pending, phone/web applications).
Discover: Early-stage engagement events where users interact with marketing or discover the platform, such as campaign clicks, site registration, and Fingerhut educational content.
First Purchase: Shopping and checkout behaviors tied to browsing and purchasing products, including browsing products, adding items to cart, beginning checkout, and placing orders.
Downpayment: Events related to making or clearing required downpayments before completing a purchase.
Credit Account: Events involving activation or review of a customer’s credit account after approval.
Order Shipped: The final successful conversion event indicating that a purchase was completed and shipped.
journey_statusdt |>
distinct(id, event_name, stage, journey_status) |>
count(journey_status, stage) |>
group_by(journey_status) |>
mutate(prop = n / sum(n)) |>
arrange(journey_status, desc(prop))
## # A tibble: 19 × 4
## # Groups: journey_status [3]
## journey_status stage n prop
## <chr> <chr> <int> <dbl>
## 1 ongoing First Purchase 392245 0.399
## 2 ongoing Apply for Credit 340132 0.346
## 3 ongoing <NA> 138798 0.141
## 4 ongoing Discover 93925 0.0956
## 5 ongoing Credit Account 11538 0.0117
## 6 ongoing Downpayment 5979 0.00608
## 7 successful First Purchase 1327993 0.379
## 8 successful Apply for Credit 534069 0.152
## 9 successful Downpayment 533281 0.152
## 10 successful <NA> 393694 0.112
## 11 successful Order Shipped 279363 0.0796
## 12 successful Credit Account 278116 0.0793
## 13 successful Discover 161480 0.0460
## 14 unsuccessful First Purchase 2122198 0.323
## 15 unsuccessful <NA> 1895474 0.289
## 16 unsuccessful Apply for Credit 1854774 0.283
## 17 unsuccessful Discover 597686 0.0911
## 18 unsuccessful Credit Account 66289 0.0101
## 19 unsuccessful Downpayment 24692 0.00376
Across all journey types, the most common stage is First Purchase, indicating that most user activity centers around browsing, cart activity, and checkout behavior.
Successful journeys show substantially more activity in the Downpayment and Credit Account stages, suggesting these are strong signals of eventual conversion.
Ongoing journeys are concentrated in First Purchase and Apply for Credit, implying that many active users are still early or mid-way through the purchasing funnel.
Unsuccessful journeys have a much larger proportion of
NA and Apply for Credit events, suggesting
many users disengage before reaching later purchasing stages.
The Order Shipped stage appears only in successful journeys, as expected, and represents the final conversion event.
Overall, successful journeys appear to progress further through the purchasing pipeline, while unsuccessful journeys tend to stall earlier in the process.
# final events
last_stage <- dt |>
arrange(id, dt_tm) |>
group_by(id) |>
slice_tail(n = 1) |>
ungroup()
last_stage |>
count(journey_status, stage) |>
group_by(journey_status) |>
mutate(prop = n / sum(n))
## # A tibble: 16 × 4
## # Groups: journey_status [3]
## journey_status stage n prop
## <chr> <chr> <int> <dbl>
## 1 ongoing Apply for Credit 22865 0.144
## 2 ongoing Credit Account 1181 0.00746
## 3 ongoing Discover 22287 0.141
## 4 ongoing Downpayment 1015 0.00641
## 5 ongoing First Purchase 75396 0.476
## 6 ongoing <NA> 35581 0.225
## 7 successful Credit Account 103 0.000369
## 8 successful Downpayment 91 0.000326
## 9 successful Order Shipped 277991 0.995
## 10 successful <NA> 1178 0.00422
## 11 unsuccessful Apply for Credit 62733 0.0632
## 12 unsuccessful Credit Account 8143 0.00820
## 13 unsuccessful Discover 38826 0.0391
## 14 unsuccessful Downpayment 1765 0.00178
## 15 unsuccessful First Purchase 166015 0.167
## 16 unsuccessful <NA> 715275 0.720
Nearly all successful journeys end in the Order Shipped stage (about 99.5%), which confirms that successful users progress fully through the purchase funnel.
Ongoing journeys most commonly end in the First Purchase stage (about 47.6%), suggesting many active users are still browsing, shopping, or checking out.
Unsuccessful journeys overwhelmingly end on events with missing
(NA) stage labels (about 72%), indicating that many users
disengage during promotional or uncategorized interactions before
progressing further.
Among categorized stages, unsuccessful journeys most often stop during the First Purchase and Apply for Credit stages, suggesting that many users drop out before completing payment or shipment-related steps.
Very few unsuccessful or ongoing journeys reach the Downpayment or Credit Account stages, implying that progressing deeper into the funnel is strongly associated with eventual success.
Overall, the analysis suggests that customer behavior differs
substantially across successful, ongoing, and unsuccessful journeys.
Successful journeys tend to progress quickly through the purchasing
funnel, with shorter overall durations, shorter gaps between actions,
and much greater participation in later-stage activities such as
Downpayment and Credit Account events before ultimately reaching Order
Shipped. In contrast, unsuccessful journeys are characterized by much
longer durations, larger inactivity gaps between actions, and a tendency
to stall earlier in the funnel, particularly during First Purchase and
Apply for Credit stages. Many unsuccessful journeys also terminate on
uncategorized (NA) events, suggesting disengagement during
promotional or less structured interactions. Ongoing journeys generally
fall between these two extremes, with moderate durations and action
gaps, and are most commonly concentrated in the First Purchase stage,
indicating that many users are still actively browsing or considering a
purchase.
library(dplyr)
library(stringr)
paths <- dt |>
arrange(id, dt_tm) |>
group_by(id) |>
summarise(
path = str_c(stage, collapse = " -> "),
journey_status = first(journey_status)
)
length(unique(paths$path)) # 57092
## [1] 57092
top_paths <- paths |>
count(journey_status, path, sort = TRUE) |>
group_by(journey_status) |>
slice_head(n = 8)
top_paths |> select(journey_status, path)
## # A tibble: 24 × 2
## # Groups: journey_status [3]
## journey_status path
## <chr> <chr>
## 1 ongoing <NA>
## 2 ongoing Apply for Credit
## 3 ongoing Apply for Credit -> Discover
## 4 ongoing Discover -> Apply for Credit
## 5 ongoing Apply for Credit -> First Purchase
## 6 ongoing Apply for Credit -> First Purchase -> Discover
## 7 ongoing Apply for Credit -> Apply for Credit -> Apply for Credit -> A…
## 8 ongoing Apply for Credit -> Apply for Credit -> Apply for Credit -> A…
## 9 successful <NA>
## 10 successful Credit Account -> Apply for Credit -> First Purchase -> Downp…
## # ℹ 14 more rows
Successful journeys tend to follow structured progression paths that move through Apply for Credit, First Purchase, Credit Account, and Downpayment before ultimately reaching Order Shipped.
Both ongoing and unsuccessful journeys are heavily concentrated in early funnel stages, especially Apply for Credit and First Purchase, suggesting many users never progress deeper into the purchasing pipeline.
Repeated sequences such as long chains of Apply for Credit indicate that many users repeatedly interact with credit-related activities without converting.
The presence of Discover stages is more common among ongoing and unsuccessful journeys, suggesting that many users remain in exploratory or marketing-related behaviors rather than advancing toward purchase completion.
Overall, successful journeys appear to exhibit deeper and more complete stage progression, while ongoing and unsuccessful journeys tend to stall earlier and show less movement into Downpayment and Order Shipped stages.
downpayment_summary <- dt |>
group_by(id, journey_status) |>
summarise(
reached_downpayment = as.integer(any(stage == "Downpayment", na.rm = TRUE)),
.groups = "drop"
)
downpayment_summary |>
group_by(journey_status) |>
count(reached_downpayment) |>
mutate(
pct = round(n / sum(n), 3)
)
## # A tibble: 6 × 4
## # Groups: journey_status [3]
## journey_status reached_downpayment n pct
## <chr> <int> <int> <dbl>
## 1 ongoing 0 154448 0.976
## 2 ongoing 1 3877 0.024
## 3 successful 0 2065 0.007
## 4 successful 1 277298 0.993
## 5 unsuccessful 0 975739 0.983
## 6 unsuccessful 1 17018 0.017
Nearly all successful journeys (99.3%) reached the Downpayment stage, suggesting that progressing to downpayment activity is one of the strongest indicators of eventual order shipment success.
Very few ongoing (2.4%) or unsuccessful (1.7%) journeys reached the Downpayment stage, indicating that most users who fail to convert never progress that deeply into the purchasing funnel.
Downpayment stage appears to strongly separate successful journeys from non-successful journeys and is likely a highly predictive feature for future models.