setwd("/Users/isaiahmireles/Desktop/M148 GP")
library(data.table)
DT <- fread("clean_dat.csv")

Task 3 – Time Series

Create a time series for the number of orders shipped. Compute the number of orders shipped per day, week, and month.

# Keep only rows with order shipped 
DT_ship <- DT[event_name == "order_shipped"]
nrow(DT_ship)
## [1] 279363
library(dplyr)
# early - late
DT_ship <- 
  DT_ship |> arrange(dt_tm) 

day_agg

day_agg <- DT_ship |> group_by(date) |> count(event_name)

plot(day_agg$date, day_agg$n)

range(day_agg$n)
## [1]    1 1619

monthly_agg

?summarise

library(lubridate)
monthly_agg <- 
  DT_ship |>
  mutate(month = floor_date(date, "month")) |> 
  group_by(month) |>
  count(event_name)

plot(monthly_agg$month, monthly_agg$n)

?floor_date

yr_agg

yr_agg <- 
  DT_ship |>
  mutate(year = floor_date(date, "year")) |> 
  group_by(year) |>
  count(event_name)

plot(yr_agg$year, yr_agg$n)

Data

setwd("/Users/isaiahmireles/Desktop/M148 GP")
dt <- fread("dat_train1.csv")
dt2 <- fread("clean_dat.csv")

Thomas Feedback:

Total orders shipped per-day approach

# just want to summarize data for total orders shipped per-day

daily_data <- dt[, .(
  y = sum(event_name=="order_shipped")
), by = .(ds = as.IDate(event_timestamp))]

# first couple weeks are all ZERO! in-progress orders were excluded
# need to find a later start point - get avg length of successful journey and take pctile

# any duplicate rows?
# anyDuplicated(prophet_train_data$ds)

Total orders shipped per-month

prophet_train_data <- dt[
  event_name == "order_shipped", 
  .(ds = as.IDate(event_timestamp))] |>
  _[, .(y = .N), by = .(ds = round(ds, "months"))]

First-action date p-success approach

# drop incomplete rows

train_cutoff_date <- as.IDate(max(dt$event_timestamp))

# flatten data by user

user_dt <- dt[, .(
    first_action_date = as.IDate(min(event_timestamp)),
    last_action_date = as.IDate(max(event_timestamp)),
    has_shipped = any(event_name == "order_shipped"),
    total_actions = .N
), by = id]

# define days since last action as days_inactive

user_dt[, days_inactive := as.integer(train_cutoff_date - last_action_date)]

user_dt[, journey_outcome := fcase(
    has_shipped == TRUE, "success",
    has_shipped == FALSE & days_inactive > 60, "failure",
    default = "incomplete"
)]

table(user_dt$journey_outcome)
## 
##    failure incomplete    success 
##     992129     158953     279363
# drop incomplete journeys

user_dt_complete <- user_dt[journey_outcome != "incomplete", ]

# want to find distr. journey length for a successes and add upper end to cutoff date to reduce survivorship bias

successes_only <- user_dt_complete[journey_outcome == "success"]

purchase_times <- dt[event_name == "order_shipped", 
                     .(purchase_date = as.IDate(min(event_timestamp))), 
                     by = id]

# join back to find the journey length
journey_lengths <- merge(successes_only, purchase_times, by = "id")
journey_lengths[, days_to_buy := as.integer(purchase_date - first_action_date)]

hist(journey_lengths$days_to_buy, breaks = 100, freq=FALSE)

# find the 95th percentile of how long it takes people to buy
p95_days_success <- quantile(journey_lengths$days_to_buy, probs = 0.95, na.rm = TRUE)
print(paste("95% of successful users buy within", round(p95_days_success), "days"))
## [1] "95% of successful users buy within 221 days"
p80_days_success <- quantile(journey_lengths$days_to_buy, probs = 0.80, na.rm = TRUE)

# define safe cohort start date
# Safe Date = Cutoff - (95th Percentile Days + 60 Days Inactivity)
safe_cutoff <- train_cutoff_date - (round(p95_days_success) + 60)

cohort_users <- user_dt_complete[first_action_date <= safe_cutoff]

prophet_train_data <- cohort_users[, .(
    total_users = .N,
    successes = sum(journey_outcome == "success"),
    y = sum(journey_outcome == "success") / .N # Prophet requires the target to be named 'y'
), by = .(ds = first_action_date)] # Prophet requires the date to be named 'ds'

Model fitting

fitting to orders-per-day

library(prophet)
## Warning: package 'prophet' was built under R version 4.4.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.4.3
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
## 
##     :=
# Initialize Prophet for daily data
m <- prophet(
  yearly.seasonality = TRUE,
  weekly.seasonality = TRUE,
  daily.seasonality = FALSE
)
m <- add_country_holidays(m, country_name = "US")
m <- fit.prophet(m, prophet_train_data)

# forecast through end of 2024:

# calculate days needed to reach Dec 2024
last_ds <- max(prophet_train_data$ds)
n_days <- length(seq(last_ds, as.IDate("2024-12-01"), by = "day")) - 1

# Create future dataframe at daily frequency
future_daily <- make_future_dataframe(m, periods = max(0, n_days), freq = "day")
forecast_daily <- predict(m, future_daily)

fitting to orders-per-month

library(prophet)

# Initialize Prophet for monthly data
m <- prophet(
  yearly.seasonality = TRUE,
  weekly.seasonality = FALSE,
  daily.seasonality = FALSE
)

m <- fit.prophet(m, prophet_train_data)

# Calculate months needed to reach Dec 2024
last_ds <- max(prophet_train_data$ds)
n_months <- length(seq(last_ds, as.IDate("2024-12-01"), by = "month")) - 1

# Create future dataframe at monthly frequency
future_monthly <- make_future_dataframe(m, periods = max(0, n_months), freq = "month")
forecast_monthly <- predict(m, future_monthly)

fitting to cohort method

library(prophet)

m <- prophet(yearly.seasonality = TRUE, weekly.seasonality=TRUE)
m <- add_country_holidays(m, country_name = "US")
m <- fit.prophet(m, prophet_train_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# calculate exactly how many days of "incomplete" cohorts we need to forecast
days_to_forecast <- as.numeric(train_cutoff_date - safe_cutoff)

# create a dataframe containing both historical dates AND future dates
future_dates <- make_future_dataframe(m, periods = days_to_forecast)

# generate the forecast
# This creates a massive dataframe containing the prediction (yhat) and all components
forecast <- predict(m, future_dates)

Visualiations

library(ggplot2)
library(dplyr)
library(patchwork)

daily orders model

plot(m, forecast_daily) +
  labs(title = "Daily Orders Forecast through 2024", x = "Date", y = "Orders Shipped") +
  theme_minimal()

# ggsave("prophet-plots/daily_orders_forecast.png", width = 8, height = 4)

# components

comp_plot_list <- prophet_plot_components(m, forecast_daily)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the prophet package.
##   Please report the issue at <https://github.com/facebook/prophet/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

wrap_plots(comp_plot_list, ncol = 1, nrow=4) +
  theme_minimal()

# ggsave("prophet-plots/daily_orders_components.png", width = 8, height = 5)

yoy_holiday_data <- forecast_daily %>%
  select(ds, holidays) %>%
  mutate(
    # Extract the actual year to use as our color category
    Year = as.factor(year(ds)),
    
    # Force every date to occur in 2024 (a leap year to protect Feb 29)
    # This aligns 2022, 2023, and 2024 exactly on top of each other
    dummy_date = update(ds, year = 2023)
  )

ggplot(yoy_holiday_data, aes(x = dummy_date, y = holidays, color = Year)) +
  # Baseline zero line
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  
  # Draw the lines (alpha = 0.7 makes overlapping years easier to see through)
  geom_line(linewidth = .5, alpha = 0.5) +
  
  # # Add dots only on the actual holidays
  # geom_point(data = yoy_holiday_data %>% filter(holidays != 0), size = 1) +
  
  # Force the X-axis to label every single month using the dummy dates
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +

  scale_color_brewer() +
  
  labs(
    title = "Year-Over-Year Holiday Effect on Orders Shipped",
    subtitle = "Baseline effect during US National Holidays",
    x = "Month",
    y = "Holiday Effect",
    color = "Year"
  ) +
  theme_minimal()

monthly model

library(ggplot2)
library(dplyr)

# Visualize monthly forecast
plot(m, forecast_monthly) +
  labs(title = "Monthly Orders Forecast through 2024", x = "Month", y = "Orders Shipped") +
  theme_minimal()

# Plot 1: The Main Forecast 
plot(m, forecast) +
  ggtitle("FingerHut Cohort Success Rate Forecast") +
  xlab("Cohort Start Date") + 
  ylab("Baseline Success Proportion") +
  theme_minimal()

# Plot 2: The Components Breakdown
prophet_plot_components(m, forecast)

# plot 3: zoom in on holiday year-over-year

yoy_holiday_data <- forecast %>%
  select(ds, holidays) %>%
  mutate(
    # Extract the actual year to use as our color category
    Year = as.factor(year(ds)),
    
    # Force every date to occur in 2024 (a leap year to protect Feb 29)
    # This aligns 2022, 2023, and 2024 exactly on top of each other
    dummy_date = update(ds, year = 2023)
  )

ggplot(yoy_holiday_data, aes(x = dummy_date, y = holidays, color = Year)) +
  # Baseline zero line
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  
  # Draw the lines (alpha = 0.7 makes overlapping years easier to see through)
  geom_line(linewidth = .5, alpha = 0.7) +
  
  # Add dots only on the actual holidays
  geom_point(data = yoy_holiday_data %>% filter(holidays != 0), size = 1) +
  
  # Force the X-axis to label every single month using the dummy dates
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +
  
  labs(
    title = "Year-Over-Year Holiday Effect on Cohort Quality",
    subtitle = "Baseline success rate multiplier during US National Holidays",
    x = "Month",
    y = "Holiday Effect",
    color = "Cohort Year"
  ) +
  theme_minimal()

Task 1

Configuring Data

dt3 <- fread("dat_train1.csv")

dt feature(s)

dt3$date <- as.Date(dt3$event_timestamp) #only dt not time -- dt obj
dt3$dt_tm <- as.POSIXct(dt3$event_timestamp, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")

dt range

range(dt3$date)
## [1] "2020-11-03" "2023-01-23"
range(dt3$dt_tm)
## [1] "2020-11-03 03:31:30 UTC" "2023-01-23 12:29:56 UTC"
  • \[1\] “2020-11-03” “2023-01-23”

  • \[1\] “2020-11-03 03:31:30 UTC” “2023-01-23 12:29:56 UTC”

counts – uniqueness

length(unique(dt3$customer_id))
## [1] 1391421
  • 1,391,421 unique cust. id
length(unique(dt3$account_id))
## [1] 1430230
  • 1,430,230 unique cust. id
length(unique(dt3$id))
## [1] 1430445
  • 1,430,445 unique id (pasted)
df <- dt3

multi_customers_per_account

library(dplyr)
multi_customers_per_account <- df %>%
  distinct(customer_id, account_id) %>%
  group_by(account_id) %>%
  summarise(n_customers = n()) %>%
  filter(n_customers > 1)

# count how many such accounts
nrow(multi_customers_per_account)
## [1] 215
# relative importance : negligible issue : frac of pct -- abt 0%
round(nrow(multi_customers_per_account)/df %>%
  distinct(customer_id, account_id) |> nrow(), 5)
## [1] 0.00015

multi_accts_per_customer

multi_accts_per_customer <- df %>%
  distinct(customer_id, account_id) %>%   # remove duplicate rows first
  group_by(customer_id) %>%
  summarise(n_accounts = n()) %>%
  filter(n_accounts > 1)

# count how many such customers
nrow(multi_accts_per_customer)
## [1] 38757
# relative importance : almost negligible issue : 3%
round(nrow(multi_accts_per_customer)/df |> 
  distinct(customer_id, account_id) |> nrow(), 5)
## [1] 0.02709

Remove Duplicates

df <- df %>%
  distinct(id, dt_tm, event_name, .keep_all = TRUE)
nrow(df)
## [1] 51848861

Number of customer, acct pairs

df %>% summarize(count = n_distinct(customer_id, account_id))
##     count
## 1 1430445
  • There are 1,430,445 customer act pairs (not counting how the list of events they may take just counting how many unique pairs there are)

ct Journey Steps

journeys <- df %>%
  arrange(customer_id, account_id, dt_tm) %>%
  group_by(customer_id, account_id) %>%
  summarise(
    event_names = list(event_name),
    event_timestamps = list(dt_tm),
    max_journey_steps_until_end = max(journey_steps_until_end),
    .groups = "drop"
  )

What do journeys look like?

This is the sequence of events ordered by dt_tm of a particular customer_id, account_id pair?

journeys$event_names[1:3]
## [[1]]
##  [1] "application_web_approved" "view_cart"               
##  [3] "browse_products"          "add_to_cart"             
##  [5] "view_cart"                "begin_checkout"          
##  [7] "promotion_created"        "browse_products"         
##  [9] "add_to_cart"              "browse_products"         
## [11] "add_to_cart"              "view_cart"               
## [13] "begin_checkout"           "view_cart"               
## [15] "browse_products"          "add_to_cart"             
## [17] "view_cart"                "begin_checkout"          
## [19] "promotion_created"        "catalog_(mail)"          
## [21] "promotion_created"        "catalog_(mail)"          
## [23] "promotion_created"        "promotion_created"       
## [25] "catalog_(mail)"           "promotion_created"       
## [27] "promotion_created"        "promotion_created"       
## [29] "promotion_created"       
## 
## [[2]]
##  [1] "begin_checkout"           "application_web_view"    
##  [3] "application_web_view"     "application_web_view"    
##  [5] "application_web_view"     "application_web_view"    
##  [7] "application_web_view"     "application_web_view"    
##  [9] "promotion_created"        "browse_products"         
## [11] "application_web_view"     "application_web_view"    
## [13] "application_web_view"     "application_web_view"    
## [15] "application_web_submit"   "application_web_view"    
## [17] "application_web_approved" "view_cart"               
## [19] "begin_checkout"           "view_cart"               
## [21] "begin_checkout"           "view_cart"               
## [23] "promotion_created"        "catalog_(mail)"          
## [25] "promotion_created"        "promotion_created"       
## [27] "catalog_(mail)"           "promotion_created"       
## [29] "promotion_created"        "catalog_(mail)"          
## [31] "promotion_created"        "catalog_(mail)"          
## 
## [[3]]
##  [1] "application_web_approved" "promotion_created"       
##  [3] "browse_products"          "add_to_cart"             
##  [5] "view_cart"                "promotion_created"       
##  [7] "promotion_created"        "catalog_(mail)"          
##  [9] "promotion_created"        "catalog_(mail)"          
## [11] "catalog_(mail)"           "promotion_created"

journey length (days)

  • For each lst object : journeys$event_timestamps

  • look at each unique time stamp (ts)

  • For time stamps with atleast 2

  • subtract the last to the first entry and count in seconds

  • otherwise label them NA as they contain less than 2

journey_seconds <- sapply(
  journeys$event_timestamps,
  function(ts) {
    if(length(ts) >= 2) {
      as.numeric(ts[length(ts)] - ts[1], units = "secs")
    } else {
      NA
    }
  }
)
mean_days <- mean(journey_seconds, na.rm=T) /(60*60*24)
mean_days
## [1] 135.4863
  • There are typically about 136 days from start to end of someones journey

  • different because i didnt sample – i used the whole thing

Typical Journey Length (num. actions)

mean(journeys$max_journey_steps_until_end)
## [1] 38.34603
  • typically about 39 actions before someone reaches their last time step

Dist – Journey Length (num. actions)

library(ggplot2)
journeys |> ggplot(aes(max_journey_steps_until_end)) + geom_histogram(bins=50) 

summary(journeys$max_journey_steps_until_end)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.00   26.00   38.35   47.00 3299.00
  • exp dist

Normalize :

journeys |> ggplot(aes(max_journey_steps_until_end)) + geom_histogram(bins=50) + 
  scale_x_log10()

  • sort of bi modal dist

Does this differ by successful & unsuccessful journeys?

journeys$order_shipped <- sapply(
  journeys$event_names,
  function(x) "order_shipped" %in% x
)

journeys |>
  group_by(order_shipped) |>
  summarise(
    avg_journey_length = mean(max_journey_steps_until_end),
    median_journey_length = median(max_journey_steps_until_end),
    n = n()
  )
## # A tibble: 2 × 4
##   order_shipped avg_journey_length median_journey_length       n
##   <lgl>                      <dbl>                 <dbl>   <int>
## 1 FALSE                       33.9                    23 1151082
## 2 TRUE                        56.8                    43  279363
  • notice that orders of those whom actually ship and order tends to be dramatically larger.

Amt of each action

Amt_action <- df |> count(event_name) |> arrange(desc(n))
Amt_action
##                                 event_name        n
##                                     <char>    <int>
##  1:                        browse_products 19422197
##  2:                              view_cart  5950083
##  3:                   application_web_view  5860261
##  4:                      promotion_created  5587608
##  5:                            add_to_cart  3694835
##  6:                         begin_checkout  2406324
##  7:                         catalog_(mail)  2235308
##  8:               application_web_approved  1393357
##  9:                  campaignemail_clicked  1377725
## 10:                 application_web_submit  1176459
## 11:                         campaign_click   849847
## 12:                      place_downpayment   372910
## 13:                   account_activitation   357917
## 14:                        place_order_web   298160
## 15:             account_downpaymentcleared   287417
## 16:                          order_shipped   279363
## 17: pre-application_(3rd_party_affiliates)   140792
## 18:                      place_order_phone    58851
## 19:             application_phone_approved    37332
## 20:               application_web_declined    36426
## 21:                      site_registration    16929
## 22:            account_downpaymentreceived     7661
## 23:             application_phone_declined      502
## 24:             catalog_(email)_(experian)      435
## 25:                   fingerhut_university      147
## 26:                application_web_pending       13
## 27:   customer_requested_catalog_(digital)        2
##                                 event_name        n
##                                     <char>    <int>
Amt_action |>
  ggplot(aes(x = reorder(event_name, n), y = n)) +
  geom_col() +
  coord_flip()

Amt_action |> filter(event_name == "order_shipped")
##       event_name      n
##           <char>  <int>
## 1: order_shipped 279363
279363/sum(Amt_action$n) * 100
## [1] 0.5388026
  • 279,363 orders shipped – abt .53
ev_def <- read.csv("Event Definitions.csv")
Amt_action |>
  left_join(ev_def, by = "event_name") |> 
  select(stage, event_name, n)
##                stage                             event_name        n
##               <char>                                 <char>    <int>
##  1:   First Purchase                        browse_products 19422197
##  2:   First Purchase                              view_cart  5950083
##  3: Apply for Credit                   application_web_view  5860261
##  4:             <NA>                      promotion_created  5587608
##  5:   First Purchase                            add_to_cart  3694835
##  6:   First Purchase                         begin_checkout  2406324
##  7:             <NA>                         catalog_(mail)  2235308
##  8: Apply for Credit               application_web_approved  1393357
##  9:             <NA>                  campaignemail_clicked  1377725
## 10: Apply for Credit                 application_web_submit  1176459
## 11:         Discover                         campaign_click   849847
## 12:      Downpayment                      place_downpayment   372910
## 13:   Credit Account                   account_activitation   357917
## 14:   First Purchase                        place_order_web   298160
## 15:      Downpayment             account_downpaymentcleared   287417
## 16:    Order Shipped                          order_shipped   279363
## 17:             <NA> pre-application_(3rd_party_affiliates)   140792
## 18:   First Purchase                      place_order_phone    58851
## 19: Apply for Credit             application_phone_approved    37332
## 20: Apply for Credit               application_web_declined    36426
## 21:         Discover                      site_registration    16929
## 22:      Downpayment            account_downpaymentreceived     7661
## 23: Apply for Credit             application_phone_declined      502
## 24:             <NA>             catalog_(email)_(experian)      435
## 25:         Discover                   fingerhut_university      147
## 26: Apply for Credit                application_web_pending       13
## 27:             <NA>   customer_requested_catalog_(digital)        2
##                stage                             event_name        n
##               <char>                                 <char>    <int>

Typical Time between actions :

gap_days <- unlist(
  lapply(journeys$event_timestamps, function(ts) {

    if(length(ts) >= 2) {

      as.numeric(diff(ts), units = "days")

    } else {

      NULL
    }
  })
)

mean(gap_days) 
## [1] 3.791877
quantile(gap_days, c(.5, .9, .99))
##          50%          90%          99% 
##  0.001018519  8.704791667 70.504438657
hist(gap_days, breaks = 100)

  • very heavily skewwed dist – usually around 0 days
gap_sec <- unlist(
  lapply(journeys$event_timestamps, function(ts) {

    if(length(ts) >= 2) {

      as.numeric(diff(ts), units = "secs")

    } else {

      NULL
    }
  })
)
hist(gap_sec, breaks = 1000)

mean(gap_sec)
## [1] 327618.1
quantile(gap_sec, c(.5, .9, .99))
##     50%     90%     99% 
##      88  752094 6091584
  • Most actions happen very quickly after one another.

  • But a small fraction of massive gaps heavily inflate the mean.

  • About half are about a minute and a half after one another

TLDR :

Task 2

A complete journey would be one where the journey ends with an order shipped. This means that no matter how long a journey takes, as long as it ends with a shipping of an order, it would be considered a successful journey. An failed journey would be a journey that with the cut off date in mind, at least 60 days before that cut off date the journeys did not have any activity. An incomplete journey would be one where the last activity was within the last 60 days that didn’t end with the shipping

cutoff_date <- max(df$event_timestamp) - as.difftime(60, units = "days")

Generate incomplete for journeys

journeys$incomplete <- sapply(
  seq_len(nrow(journeys)),
  function(i) {

    !("order_shipped" %in% journeys$event_names[[i]]) &&
      (max(journeys$event_timestamps[[i]]) >= cutoff_date)

  }
)
table(journeys$incomplete)
## 
##   FALSE    TRUE 
## 1272120  158325
incomplete
False    1272120
True      158325
dtype: int64
  • same values!
training_data <- journeys |>
  filter(incomplete == FALSE)

set.seed(123)

train_df <- training_data |>
  slice_sample(prop = 0.10)

Generate Features :

train_df$will_ship <- sapply(
  train_df$event_names,
  function(x) x[length(x)] == "order_shipped"
)

train_df$n_events <- sapply(train_df$event_names, length)

train_df$n_unique_events <- sapply(
  train_df$event_names,
  function(x) length(unique(x))
)

train_df$journey_duration_days <- sapply(
  train_df$event_timestamps,
  function(x) as.numeric(max(x) - min(x), units = "days")
)

train_df$last_event_is_checkout <- sapply(
  train_df$event_names,
  function(x) as.integer(x[length(x)] == "begin_checkout")
)

train_df$n_add_to_cart <- sapply(
  train_df$event_names,
  function(x) sum(x == "add_to_cart")
)

train_df$n_begin_checkout <- sapply(
  train_df$event_names,
  function(x) sum(x == "begin_checkout")
)

train_df$n_view_cart <- sapply(
  train_df$event_names,
  function(x) sum(x == "view_cart")
)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.9     ✔ rsample      1.2.1
## ✔ dials        1.4.1     ✔ tibble       3.2.1
## ✔ infer        1.0.9     ✔ tidyr        1.3.1
## ✔ modeldata    1.5.0     ✔ tune         1.3.0
## ✔ parsnip      1.3.2     ✔ workflows    1.2.0
## ✔ purrr        1.1.0     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ rlang:::=()          masks data.table:::=()
## ✖ purrr::%@%()         masks rlang::%@%()
## ✖ dplyr::between()     masks data.table::between()
## ✖ purrr::discard()     masks scales::discard()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ purrr::flatten()     masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke()      masks rlang::invoke()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ rsample::populate()  masks Rcpp::populate()
## ✖ purrr::splice()      masks rlang::splice()
## ✖ recipes::step()      masks stats::step()
## ✖ purrr::transpose()   masks data.table::transpose()
library(ranger)

train_df$will_ship <- factor(train_df$will_ship)

rf_spec <- rand_forest(
  trees = 200
) |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("classification")

rf_fit <- rf_spec |>
  fit(
    will_ship ~ max_journey_steps_until_end +
      n_events +
      n_unique_events +
      journey_duration_days +
      last_event_is_checkout +
      n_add_to_cart +
      n_begin_checkout +
      n_view_cart,
    data = train_df
  )

rf_fit
## parsnip model object
## 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~200,      importance = ~"impurity", num.threads = 1, verbose = FALSE,      seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  200 
## Sample size:                      127212 
## Number of independent variables:  8 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.02117077
preds <- predict(rf_fit, train_df) |>
  bind_cols(train_df |> select(will_ship))

accuracy(preds, truth = will_ship, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.983

Previous scores from old assignment :

OOB accuracy: 0.9737917806496242
Training accuracy: 0.9998899474892305
  • notice we are getting similar values

Variable Importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
vip(rf_fit$fit) +
  ggplot2::labs(
    title = "Variable Importance",
    x = "Importance",
    y = NULL
  )

  • notice we are getting the same features as equally important

Kaggle Prediction

key point from Task 3

getwd()
## [1] "/Users/isaiahmireles/Desktop/M148 GP"
oj <- fread("open_journeys1.csv")

Generate Features

train_df$n_place_downpayment <- sapply(train_df$event_names, function(x) sum(x == "place_downpayment"))

train_df$n_place_order_web <- sapply(train_df$event_names, function(x) sum(x == "place_order_web"))

train_df$n_place_order_phone <- sapply(train_df$event_names, function(x) sum(x == "place_order_phone"))

train_df$last_is_promotion_created <- sapply(
  train_df$event_names,
  function(x) as.integer(x[length(x)] == "promotion_created")
)

train_df$n_promotion_created <- sapply(train_df$event_names, function(x) sum(x == "promotion_created"))

train_df$n_application_web_approved <- sapply(train_df$event_names, function(x) sum(x == "application_web_approved"))

train_df$n_browse_products <- sapply(train_df$event_names, function(x) sum(x == "browse_products"))

train_df$n_application_web_view <- sapply(train_df$event_names, function(x) sum(x == "application_web_view"))

train_df$n_campaign_click <- sapply(train_df$event_names, function(x) sum(x == "campaign_click"))

train_df$n_application_web_submit <- sapply(train_df$event_names, function(x) sum(x == "application_web_submit"))

train_df$n_campaignemail_clicked <- sapply(train_df$event_names, function(x) sum(x == "campaignemail_clicked"))

train_df$last_is_place_order_phone <- sapply(
  train_df$event_names,
  function(x) as.integer(x[length(x)] == "place_order_phone")
)

train_df$last_is_campaign_click <- sapply(
  train_df$event_names,
  function(x) as.integer(x[length(x)] == "campaign_click")
)

train_df$last_is_begin_checkout <- sapply(
  train_df$event_names,
  function(x) as.integer(x[length(x)] == "begin_checkout")
)
library(tidymodels)
library(ranger)
library(ggplot2)

feature_cols <- c(
  "max_journey_steps_until_end",
  "n_place_downpayment",
  "n_place_order_web",
  "n_unique_events",
  "n_place_order_phone",
  "n_begin_checkout",
  "journey_duration_days",
  "last_is_promotion_created",
  "n_add_to_cart",
  "n_promotion_created",
  "n_view_cart",
  "n_events",
  "n_application_web_approved",
  "n_browse_products",
  "n_application_web_view",
  "n_campaign_click",
  "n_application_web_submit",
  "n_campaignemail_clicked",
  "last_is_place_order_phone",
  "last_is_campaign_click",
  "last_is_begin_checkout"
)

train_df$will_ship <- factor(train_df$will_ship)

rf_spec <- rand_forest(
  trees = 150,
  mtry = floor(sqrt(length(feature_cols))),
  min_n = 10
) |>
  set_engine("ranger", importance = "impurity", seed = 42) |>
  set_mode("classification")

rf_fit_new <- rf_spec |>
  fit(
    as.formula(paste("will_ship ~", paste(feature_cols, collapse = " + "))),
    data = train_df
  )

rf_fit_new
## parsnip model object
## 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~floor(sqrt(length(feature_cols))),      x), num.trees = ~150, min.node.size = min_rows(~10, x), importance = ~"impurity",      seed = ~42, num.threads = 1, verbose = FALSE, probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  150 
## Sample size:                      127212 
## Number of independent variables:  21 
## Mtry:                             4 
## Target node size:                 10 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.009584753
1 - rf_fit_new$fit$prediction.error
## [1] 0.9904152
train_preds <- predict(rf_fit_new, train_df) |>
  bind_cols(train_df |> select(will_ship))

accuracy(
  train_preds,
  truth = will_ship,
  estimate = .pred_class
)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.994

Variable importance

importance_df <- data.frame(
  feature = names(rf_fit_new$fit$variable.importance),
  importance = rf_fit_new$fit$variable.importance
) |>
  arrange(importance)

importance_df |>
  ggplot(aes(x = reorder(feature, importance), y = importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = NULL,
    y = "Importance"
  )

Cross Fold Validation

library(tidymodels)
library(ranger)
library(ggplot2)

set.seed(123)

train_df$will_ship <- factor(train_df$will_ship)

rf_formula <- as.formula(
  paste("will_ship ~", paste(feature_cols, collapse = " + "))
)

folds <- vfold_cv(
  train_df,
  v = 5,
  strata = will_ship
)

rf_spec <- rand_forest(
  trees = 150,
  mtry = floor(sqrt(length(feature_cols))),
  min_n = 10
) |>
  set_engine("ranger", importance = "impurity", seed = 42) |>
  set_mode("classification")

rf_wf <- workflow() |>
  add_model(rf_spec) |>
  add_formula(rf_formula)

rf_cv <- fit_resamples(
  rf_wf,
  resamples = folds,
  metrics = metric_set(accuracy, roc_auc),
  control = control_resamples(save_pred = TRUE)
)
collect_metrics(rf_cv)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n   std_err .config             
##   <chr>    <chr>      <dbl> <int>     <dbl> <chr>               
## 1 accuracy binary     0.988     5 0.000334  Preprocessor1_Model1
## 2 roc_auc  binary     0.998     5 0.0000742 Preprocessor1_Model1
  • Model level metrics :
collect_metrics(rf_cv, summarize = FALSE)
## # A tibble: 10 × 5
##    id    .metric  .estimator .estimate .config             
##    <chr> <chr>    <chr>          <dbl> <chr>               
##  1 Fold1 accuracy binary         0.989 Preprocessor1_Model1
##  2 Fold1 roc_auc  binary         0.998 Preprocessor1_Model1
##  3 Fold2 accuracy binary         0.987 Preprocessor1_Model1
##  4 Fold2 roc_auc  binary         0.998 Preprocessor1_Model1
##  5 Fold3 accuracy binary         0.989 Preprocessor1_Model1
##  6 Fold3 roc_auc  binary         0.998 Preprocessor1_Model1
##  7 Fold4 accuracy binary         0.988 Preprocessor1_Model1
##  8 Fold4 roc_auc  binary         0.998 Preprocessor1_Model1
##  9 Fold5 accuracy binary         0.989 Preprocessor1_Model1
## 10 Fold5 roc_auc  binary         0.998 Preprocessor1_Model1
  • fold level data ^
collect_metrics(rf_cv) |>
  filter(.metric == "accuracy")
## # A tibble: 1 × 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy binary     0.988     5 0.000334 Preprocessor1_Model1

Specificity

rf_cv <- fit_resamples(
  rf_wf,
  resamples = folds,
  metrics = metric_set(accuracy, roc_auc, sensitivity, specificity),
  control = control_resamples(save_pred = TRUE)
)

collect_metrics(rf_cv)
## # A tibble: 4 × 6
##   .metric     .estimator  mean     n   std_err .config             
##   <chr>       <chr>      <dbl> <int>     <dbl> <chr>               
## 1 accuracy    binary     0.988     5 0.000334  Preprocessor1_Model1
## 2 roc_auc     binary     0.998     5 0.0000742 Preprocessor1_Model1
## 3 sensitivity binary     0.990     5 0.000234  Preprocessor1_Model1
## 4 specificity binary     0.983     5 0.00104   Preprocessor1_Model1
  • Using 5-fold cross-validation, the random forest achieved 98.9% accuracy and 0.998 ROC AUC. Sensitivity and specificity were also high, suggesting strong performance on both shipped and non-shipped journeys. However, because the performance is extremely high, I would be cautious about possible leakage from variables that may encode future journey information, especially max_journey_steps_until_end.
open_df <- journeys |>
  filter(incomplete == TRUE)
add_rf_features <- function(dat) {
  dat$n_events <- sapply(dat$event_names, length)
  dat$n_unique_events <- sapply(dat$event_names, function(x) length(unique(x)))
  dat$journey_duration_days <- sapply(dat$event_timestamps, function(x) {
    as.numeric(max(x) - min(x), units = "days")
  })

  dat$n_place_downpayment <- sapply(dat$event_names, function(x) sum(x == "place_downpayment"))
  dat$n_place_order_web <- sapply(dat$event_names, function(x) sum(x == "place_order_web"))
  dat$n_place_order_phone <- sapply(dat$event_names, function(x) sum(x == "place_order_phone"))
  dat$n_begin_checkout <- sapply(dat$event_names, function(x) sum(x == "begin_checkout"))
  dat$n_add_to_cart <- sapply(dat$event_names, function(x) sum(x == "add_to_cart"))
  dat$n_promotion_created <- sapply(dat$event_names, function(x) sum(x == "promotion_created"))
  dat$n_view_cart <- sapply(dat$event_names, function(x) sum(x == "view_cart"))
  dat$n_application_web_approved <- sapply(dat$event_names, function(x) sum(x == "application_web_approved"))
  dat$n_browse_products <- sapply(dat$event_names, function(x) sum(x == "browse_products"))
  dat$n_application_web_view <- sapply(dat$event_names, function(x) sum(x == "application_web_view"))
  dat$n_campaign_click <- sapply(dat$event_names, function(x) sum(x == "campaign_click"))
  dat$n_application_web_submit <- sapply(dat$event_names, function(x) sum(x == "application_web_submit"))
  dat$n_campaignemail_clicked <- sapply(dat$event_names, function(x) sum(x == "campaignemail_clicked"))

  dat$last_is_promotion_created <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "promotion_created"))
  dat$last_is_place_order_phone <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "place_order_phone"))
  dat$last_is_campaign_click <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "campaign_click"))
  dat$last_is_begin_checkout <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "begin_checkout"))

  dat
}

open_df <- add_rf_features(open_df)
pred_probs <- predict(rf_fit_new, open_df, type = "prob")
pred_class <- predict(rf_fit_new, open_df)

pred <- open_df |>
  select(customer_id, account_id) |>
  bind_cols(pred_probs, pred_class) |>
  mutate(
    id = paste(customer_id, account_id),
    ship_probability = .pred_TRUE,
    predicted_order_shipped = as.integer(.pred_class == "TRUE")
  ) |>
  select(id, ship_probability, predicted_order_shipped)
success_rate <- mean(pred$predicted_order_shipped)
failure_rate <- 1 - success_rate

success_rate
## [1] 0.01348492
failure_rate
## [1] 0.9865151

Kaggle Submission

oj <- fread("open_journeys1.csv")

oj$date <- as.Date(oj$event_timestamp)

oj$dt_tm <- as.POSIXct(
  oj$event_timestamp,
  format = "%Y-%m-%dT%H:%M:%SZ",
  tz = "UTC"
)

oj <- oj |>
  distinct(id, dt_tm, event_name, .keep_all = TRUE)

open_journeys <- oj |>
  arrange(id, dt_tm) |>
  group_by(id) |>
  summarise(
    event_names = list(event_name),
    event_timestamps = list(dt_tm),
    max_journey_steps_until_end = max(journey_steps_until_end),
    .groups = "drop"
  )
open_journeys <- add_rf_features(open_journeys)
pred_probs <- predict(
  rf_fit_new,
  open_journeys,
  type = "prob"
)
submission <- open_journeys |>
  bind_cols(pred_probs) |>
  transmute(
    id,
    order_shipped = .pred_TRUE
  )

write.csv(submission, “k_submission.csv”, row.names = FALSE )

sample_sub <- fread("open_journeys1_flattened_all0.csv")
nrow(submission)
## [1] 158325
nrow(sample_sub)
## [1] 158325
all(submission$id == sample_sub$id)
## [1] TRUE