setwd("/Users/isaiahmireles/Desktop/M148 GP")
library(data.table)
DT <- fread("clean_dat.csv")
Create a time series for the number of orders shipped. Compute the number of orders shipped per day, week, and month.
# Keep only rows with order shipped
DT_ship <- DT[event_name == "order_shipped"]
nrow(DT_ship)
## [1] 279363
library(dplyr)
# early - late
DT_ship <-
DT_ship |> arrange(dt_tm)
day_agg <- DT_ship |> group_by(date) |> count(event_name)
plot(day_agg$date, day_agg$n)
range(day_agg$n)
## [1] 1 1619
?summarise
library(lubridate)
monthly_agg <-
DT_ship |>
mutate(month = floor_date(date, "month")) |>
group_by(month) |>
count(event_name)
plot(monthly_agg$month, monthly_agg$n)
?floor_date
yr_agg <-
DT_ship |>
mutate(year = floor_date(date, "year")) |>
group_by(year) |>
count(event_name)
plot(yr_agg$year, yr_agg$n)
setwd("/Users/isaiahmireles/Desktop/M148 GP")
dt <- fread("dat_train1.csv")
dt2 <- fread("clean_dat.csv")
Thomas Feedback:
# just want to summarize data for total orders shipped per-day
daily_data <- dt[, .(
y = sum(event_name=="order_shipped")
), by = .(ds = as.IDate(event_timestamp))]
# first couple weeks are all ZERO! in-progress orders were excluded
# need to find a later start point - get avg length of successful journey and take pctile
# any duplicate rows?
# anyDuplicated(prophet_train_data$ds)
prophet_train_data <- dt[
event_name == "order_shipped",
.(ds = as.IDate(event_timestamp))] |>
_[, .(y = .N), by = .(ds = round(ds, "months"))]
# drop incomplete rows
train_cutoff_date <- as.IDate(max(dt$event_timestamp))
# flatten data by user
user_dt <- dt[, .(
first_action_date = as.IDate(min(event_timestamp)),
last_action_date = as.IDate(max(event_timestamp)),
has_shipped = any(event_name == "order_shipped"),
total_actions = .N
), by = id]
# define days since last action as days_inactive
user_dt[, days_inactive := as.integer(train_cutoff_date - last_action_date)]
user_dt[, journey_outcome := fcase(
has_shipped == TRUE, "success",
has_shipped == FALSE & days_inactive > 60, "failure",
default = "incomplete"
)]
table(user_dt$journey_outcome)
##
## failure incomplete success
## 992129 158953 279363
# drop incomplete journeys
user_dt_complete <- user_dt[journey_outcome != "incomplete", ]
# want to find distr. journey length for a successes and add upper end to cutoff date to reduce survivorship bias
successes_only <- user_dt_complete[journey_outcome == "success"]
purchase_times <- dt[event_name == "order_shipped",
.(purchase_date = as.IDate(min(event_timestamp))),
by = id]
# join back to find the journey length
journey_lengths <- merge(successes_only, purchase_times, by = "id")
journey_lengths[, days_to_buy := as.integer(purchase_date - first_action_date)]
hist(journey_lengths$days_to_buy, breaks = 100, freq=FALSE)
# find the 95th percentile of how long it takes people to buy
p95_days_success <- quantile(journey_lengths$days_to_buy, probs = 0.95, na.rm = TRUE)
print(paste("95% of successful users buy within", round(p95_days_success), "days"))
## [1] "95% of successful users buy within 221 days"
p80_days_success <- quantile(journey_lengths$days_to_buy, probs = 0.80, na.rm = TRUE)
# define safe cohort start date
# Safe Date = Cutoff - (95th Percentile Days + 60 Days Inactivity)
safe_cutoff <- train_cutoff_date - (round(p95_days_success) + 60)
cohort_users <- user_dt_complete[first_action_date <= safe_cutoff]
prophet_train_data <- cohort_users[, .(
total_users = .N,
successes = sum(journey_outcome == "success"),
y = sum(journey_outcome == "success") / .N # Prophet requires the target to be named 'y'
), by = .(ds = first_action_date)] # Prophet requires the date to be named 'ds'
library(prophet)
## Warning: package 'prophet' was built under R version 4.4.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.4.3
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
##
## :=
# Initialize Prophet for daily data
m <- prophet(
yearly.seasonality = TRUE,
weekly.seasonality = TRUE,
daily.seasonality = FALSE
)
m <- add_country_holidays(m, country_name = "US")
m <- fit.prophet(m, prophet_train_data)
# forecast through end of 2024:
# calculate days needed to reach Dec 2024
last_ds <- max(prophet_train_data$ds)
n_days <- length(seq(last_ds, as.IDate("2024-12-01"), by = "day")) - 1
# Create future dataframe at daily frequency
future_daily <- make_future_dataframe(m, periods = max(0, n_days), freq = "day")
forecast_daily <- predict(m, future_daily)
library(prophet)
# Initialize Prophet for monthly data
m <- prophet(
yearly.seasonality = TRUE,
weekly.seasonality = FALSE,
daily.seasonality = FALSE
)
m <- fit.prophet(m, prophet_train_data)
# Calculate months needed to reach Dec 2024
last_ds <- max(prophet_train_data$ds)
n_months <- length(seq(last_ds, as.IDate("2024-12-01"), by = "month")) - 1
# Create future dataframe at monthly frequency
future_monthly <- make_future_dataframe(m, periods = max(0, n_months), freq = "month")
forecast_monthly <- predict(m, future_monthly)
library(prophet)
m <- prophet(yearly.seasonality = TRUE, weekly.seasonality=TRUE)
m <- add_country_holidays(m, country_name = "US")
m <- fit.prophet(m, prophet_train_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# calculate exactly how many days of "incomplete" cohorts we need to forecast
days_to_forecast <- as.numeric(train_cutoff_date - safe_cutoff)
# create a dataframe containing both historical dates AND future dates
future_dates <- make_future_dataframe(m, periods = days_to_forecast)
# generate the forecast
# This creates a massive dataframe containing the prediction (yhat) and all components
forecast <- predict(m, future_dates)
library(ggplot2)
library(dplyr)
library(patchwork)
plot(m, forecast_daily) +
labs(title = "Daily Orders Forecast through 2024", x = "Date", y = "Orders Shipped") +
theme_minimal()
# ggsave("prophet-plots/daily_orders_forecast.png", width = 8, height = 4)
# components
comp_plot_list <- prophet_plot_components(m, forecast_daily)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the prophet package.
## Please report the issue at <https://github.com/facebook/prophet/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
wrap_plots(comp_plot_list, ncol = 1, nrow=4) +
theme_minimal()
# ggsave("prophet-plots/daily_orders_components.png", width = 8, height = 5)
yoy_holiday_data <- forecast_daily %>%
select(ds, holidays) %>%
mutate(
# Extract the actual year to use as our color category
Year = as.factor(year(ds)),
# Force every date to occur in 2024 (a leap year to protect Feb 29)
# This aligns 2022, 2023, and 2024 exactly on top of each other
dummy_date = update(ds, year = 2023)
)
ggplot(yoy_holiday_data, aes(x = dummy_date, y = holidays, color = Year)) +
# Baseline zero line
geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
# Draw the lines (alpha = 0.7 makes overlapping years easier to see through)
geom_line(linewidth = .5, alpha = 0.5) +
# # Add dots only on the actual holidays
# geom_point(data = yoy_holiday_data %>% filter(holidays != 0), size = 1) +
# Force the X-axis to label every single month using the dummy dates
scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +
scale_color_brewer() +
labs(
title = "Year-Over-Year Holiday Effect on Orders Shipped",
subtitle = "Baseline effect during US National Holidays",
x = "Month",
y = "Holiday Effect",
color = "Year"
) +
theme_minimal()
library(ggplot2)
library(dplyr)
# Visualize monthly forecast
plot(m, forecast_monthly) +
labs(title = "Monthly Orders Forecast through 2024", x = "Month", y = "Orders Shipped") +
theme_minimal()
# Plot 1: The Main Forecast
plot(m, forecast) +
ggtitle("FingerHut Cohort Success Rate Forecast") +
xlab("Cohort Start Date") +
ylab("Baseline Success Proportion") +
theme_minimal()
# Plot 2: The Components Breakdown
prophet_plot_components(m, forecast)
# plot 3: zoom in on holiday year-over-year
yoy_holiday_data <- forecast %>%
select(ds, holidays) %>%
mutate(
# Extract the actual year to use as our color category
Year = as.factor(year(ds)),
# Force every date to occur in 2024 (a leap year to protect Feb 29)
# This aligns 2022, 2023, and 2024 exactly on top of each other
dummy_date = update(ds, year = 2023)
)
ggplot(yoy_holiday_data, aes(x = dummy_date, y = holidays, color = Year)) +
# Baseline zero line
geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
# Draw the lines (alpha = 0.7 makes overlapping years easier to see through)
geom_line(linewidth = .5, alpha = 0.7) +
# Add dots only on the actual holidays
geom_point(data = yoy_holiday_data %>% filter(holidays != 0), size = 1) +
# Force the X-axis to label every single month using the dummy dates
scale_x_datetime(date_breaks = "1 month", date_labels = "%b") +
labs(
title = "Year-Over-Year Holiday Effect on Cohort Quality",
subtitle = "Baseline success rate multiplier during US National Holidays",
x = "Month",
y = "Holiday Effect",
color = "Cohort Year"
) +
theme_minimal()
dt3 <- fread("dat_train1.csv")
dt3$date <- as.Date(dt3$event_timestamp) #only dt not time -- dt obj
dt3$dt_tm <- as.POSIXct(dt3$event_timestamp, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
range(dt3$date)
## [1] "2020-11-03" "2023-01-23"
range(dt3$dt_tm)
## [1] "2020-11-03 03:31:30 UTC" "2023-01-23 12:29:56 UTC"
\[1\] “2020-11-03” “2023-01-23”
\[1\] “2020-11-03 03:31:30 UTC” “2023-01-23 12:29:56 UTC”
length(unique(dt3$customer_id))
## [1] 1391421
length(unique(dt3$account_id))
## [1] 1430230
length(unique(dt3$id))
## [1] 1430445
df <- dt3
library(dplyr)
multi_customers_per_account <- df %>%
distinct(customer_id, account_id) %>%
group_by(account_id) %>%
summarise(n_customers = n()) %>%
filter(n_customers > 1)
# count how many such accounts
nrow(multi_customers_per_account)
## [1] 215
# relative importance : negligible issue : frac of pct -- abt 0%
round(nrow(multi_customers_per_account)/df %>%
distinct(customer_id, account_id) |> nrow(), 5)
## [1] 0.00015
multi_accts_per_customer <- df %>%
distinct(customer_id, account_id) %>% # remove duplicate rows first
group_by(customer_id) %>%
summarise(n_accounts = n()) %>%
filter(n_accounts > 1)
# count how many such customers
nrow(multi_accts_per_customer)
## [1] 38757
# relative importance : almost negligible issue : 3%
round(nrow(multi_accts_per_customer)/df |>
distinct(customer_id, account_id) |> nrow(), 5)
## [1] 0.02709
df <- df %>%
distinct(id, dt_tm, event_name, .keep_all = TRUE)
nrow(df)
## [1] 51848861
df %>% summarize(count = n_distinct(customer_id, account_id))
## count
## 1 1430445
journeys <- df %>%
arrange(customer_id, account_id, dt_tm) %>%
group_by(customer_id, account_id) %>%
summarise(
event_names = list(event_name),
event_timestamps = list(dt_tm),
max_journey_steps_until_end = max(journey_steps_until_end),
.groups = "drop"
)
This is the sequence of events ordered by dt_tm of a particular customer_id, account_id pair?
journeys$event_names[1:3]
## [[1]]
## [1] "application_web_approved" "view_cart"
## [3] "browse_products" "add_to_cart"
## [5] "view_cart" "begin_checkout"
## [7] "promotion_created" "browse_products"
## [9] "add_to_cart" "browse_products"
## [11] "add_to_cart" "view_cart"
## [13] "begin_checkout" "view_cart"
## [15] "browse_products" "add_to_cart"
## [17] "view_cart" "begin_checkout"
## [19] "promotion_created" "catalog_(mail)"
## [21] "promotion_created" "catalog_(mail)"
## [23] "promotion_created" "promotion_created"
## [25] "catalog_(mail)" "promotion_created"
## [27] "promotion_created" "promotion_created"
## [29] "promotion_created"
##
## [[2]]
## [1] "begin_checkout" "application_web_view"
## [3] "application_web_view" "application_web_view"
## [5] "application_web_view" "application_web_view"
## [7] "application_web_view" "application_web_view"
## [9] "promotion_created" "browse_products"
## [11] "application_web_view" "application_web_view"
## [13] "application_web_view" "application_web_view"
## [15] "application_web_submit" "application_web_view"
## [17] "application_web_approved" "view_cart"
## [19] "begin_checkout" "view_cart"
## [21] "begin_checkout" "view_cart"
## [23] "promotion_created" "catalog_(mail)"
## [25] "promotion_created" "promotion_created"
## [27] "catalog_(mail)" "promotion_created"
## [29] "promotion_created" "catalog_(mail)"
## [31] "promotion_created" "catalog_(mail)"
##
## [[3]]
## [1] "application_web_approved" "promotion_created"
## [3] "browse_products" "add_to_cart"
## [5] "view_cart" "promotion_created"
## [7] "promotion_created" "catalog_(mail)"
## [9] "promotion_created" "catalog_(mail)"
## [11] "catalog_(mail)" "promotion_created"
For each lst object : journeys$event_timestamps
look at each unique time stamp (ts)
For time stamps with atleast 2
subtract the last to the first entry and count in seconds
otherwise label them NA as they contain less than 2
journey_seconds <- sapply(
journeys$event_timestamps,
function(ts) {
if(length(ts) >= 2) {
as.numeric(ts[length(ts)] - ts[1], units = "secs")
} else {
NA
}
}
)
mean_days <- mean(journey_seconds, na.rm=T) /(60*60*24)
mean_days
## [1] 135.4863
There are typically about 136 days from start to end of someones journey
different because i didnt sample – i used the whole thing
mean(journeys$max_journey_steps_until_end)
## [1] 38.34603
library(ggplot2)
journeys |> ggplot(aes(max_journey_steps_until_end)) + geom_histogram(bins=50)
summary(journeys$max_journey_steps_until_end)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 14.00 26.00 38.35 47.00 3299.00
journeys |> ggplot(aes(max_journey_steps_until_end)) + geom_histogram(bins=50) +
scale_x_log10()
journeys$order_shipped <- sapply(
journeys$event_names,
function(x) "order_shipped" %in% x
)
journeys |>
group_by(order_shipped) |>
summarise(
avg_journey_length = mean(max_journey_steps_until_end),
median_journey_length = median(max_journey_steps_until_end),
n = n()
)
## # A tibble: 2 × 4
## order_shipped avg_journey_length median_journey_length n
## <lgl> <dbl> <dbl> <int>
## 1 FALSE 33.9 23 1151082
## 2 TRUE 56.8 43 279363
Amt_action <- df |> count(event_name) |> arrange(desc(n))
Amt_action
## event_name n
## <char> <int>
## 1: browse_products 19422197
## 2: view_cart 5950083
## 3: application_web_view 5860261
## 4: promotion_created 5587608
## 5: add_to_cart 3694835
## 6: begin_checkout 2406324
## 7: catalog_(mail) 2235308
## 8: application_web_approved 1393357
## 9: campaignemail_clicked 1377725
## 10: application_web_submit 1176459
## 11: campaign_click 849847
## 12: place_downpayment 372910
## 13: account_activitation 357917
## 14: place_order_web 298160
## 15: account_downpaymentcleared 287417
## 16: order_shipped 279363
## 17: pre-application_(3rd_party_affiliates) 140792
## 18: place_order_phone 58851
## 19: application_phone_approved 37332
## 20: application_web_declined 36426
## 21: site_registration 16929
## 22: account_downpaymentreceived 7661
## 23: application_phone_declined 502
## 24: catalog_(email)_(experian) 435
## 25: fingerhut_university 147
## 26: application_web_pending 13
## 27: customer_requested_catalog_(digital) 2
## event_name n
## <char> <int>
Amt_action |>
ggplot(aes(x = reorder(event_name, n), y = n)) +
geom_col() +
coord_flip()
Amt_action |> filter(event_name == "order_shipped")
## event_name n
## <char> <int>
## 1: order_shipped 279363
279363/sum(Amt_action$n) * 100
## [1] 0.5388026
ev_def <- read.csv("Event Definitions.csv")
Amt_action |>
left_join(ev_def, by = "event_name") |>
select(stage, event_name, n)
## stage event_name n
## <char> <char> <int>
## 1: First Purchase browse_products 19422197
## 2: First Purchase view_cart 5950083
## 3: Apply for Credit application_web_view 5860261
## 4: <NA> promotion_created 5587608
## 5: First Purchase add_to_cart 3694835
## 6: First Purchase begin_checkout 2406324
## 7: <NA> catalog_(mail) 2235308
## 8: Apply for Credit application_web_approved 1393357
## 9: <NA> campaignemail_clicked 1377725
## 10: Apply for Credit application_web_submit 1176459
## 11: Discover campaign_click 849847
## 12: Downpayment place_downpayment 372910
## 13: Credit Account account_activitation 357917
## 14: First Purchase place_order_web 298160
## 15: Downpayment account_downpaymentcleared 287417
## 16: Order Shipped order_shipped 279363
## 17: <NA> pre-application_(3rd_party_affiliates) 140792
## 18: First Purchase place_order_phone 58851
## 19: Apply for Credit application_phone_approved 37332
## 20: Apply for Credit application_web_declined 36426
## 21: Discover site_registration 16929
## 22: Downpayment account_downpaymentreceived 7661
## 23: Apply for Credit application_phone_declined 502
## 24: <NA> catalog_(email)_(experian) 435
## 25: Discover fingerhut_university 147
## 26: Apply for Credit application_web_pending 13
## 27: <NA> customer_requested_catalog_(digital) 2
## stage event_name n
## <char> <char> <int>
gap_days <- unlist(
lapply(journeys$event_timestamps, function(ts) {
if(length(ts) >= 2) {
as.numeric(diff(ts), units = "days")
} else {
NULL
}
})
)
mean(gap_days)
## [1] 3.791877
quantile(gap_days, c(.5, .9, .99))
## 50% 90% 99%
## 0.001018519 8.704791667 70.504438657
hist(gap_days, breaks = 100)
gap_sec <- unlist(
lapply(journeys$event_timestamps, function(ts) {
if(length(ts) >= 2) {
as.numeric(diff(ts), units = "secs")
} else {
NULL
}
})
)
hist(gap_sec, breaks = 1000)
mean(gap_sec)
## [1] 327618.1
quantile(gap_sec, c(.5, .9, .99))
## 50% 90% 99%
## 88 752094 6091584
Most actions happen very quickly after one another.
But a small fraction of massive gaps heavily inflate the mean.
About half are about a minute and a half after one another
Those whom ship an order are usually on website for longer
browse_products, view_cart, application_web_view, promotion_created, add_to_cart, begin_checkout are common user actions
Shipping an order makes up a small portion of typical user actions
The typical duration of a
There are typically about 136 days from start to end of someones journey
actions happen very quickly after one another.
A complete journey would be one where the journey ends with an order shipped. This means that no matter how long a journey takes, as long as it ends with a shipping of an order, it would be considered a successful journey. An failed journey would be a journey that with the cut off date in mind, at least 60 days before that cut off date the journeys did not have any activity. An incomplete journey would be one where the last activity was within the last 60 days that didn’t end with the shipping
cutoff_date <- max(df$event_timestamp) - as.difftime(60, units = "days")
journeys$incomplete <- sapply(
seq_len(nrow(journeys)),
function(i) {
!("order_shipped" %in% journeys$event_names[[i]]) &&
(max(journeys$event_timestamps[[i]]) >= cutoff_date)
}
)
table(journeys$incomplete)
##
## FALSE TRUE
## 1272120 158325
incomplete
False 1272120
True 158325
dtype: int64
training_data <- journeys |>
filter(incomplete == FALSE)
set.seed(123)
train_df <- training_data |>
slice_sample(prop = 0.10)
train_df$will_ship <- sapply(
train_df$event_names,
function(x) x[length(x)] == "order_shipped"
)
train_df$n_events <- sapply(train_df$event_names, length)
train_df$n_unique_events <- sapply(
train_df$event_names,
function(x) length(unique(x))
)
train_df$journey_duration_days <- sapply(
train_df$event_timestamps,
function(x) as.numeric(max(x) - min(x), units = "days")
)
train_df$last_event_is_checkout <- sapply(
train_df$event_names,
function(x) as.integer(x[length(x)] == "begin_checkout")
)
train_df$n_add_to_cart <- sapply(
train_df$event_names,
function(x) sum(x == "add_to_cart")
)
train_df$n_begin_checkout <- sapply(
train_df$event_names,
function(x) sum(x == "begin_checkout")
)
train_df$n_view_cart <- sapply(
train_df$event_names,
function(x) sum(x == "view_cart")
)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.9 ✔ rsample 1.2.1
## ✔ dials 1.4.1 ✔ tibble 3.2.1
## ✔ infer 1.0.9 ✔ tidyr 1.3.1
## ✔ modeldata 1.5.0 ✔ tune 1.3.0
## ✔ parsnip 1.3.2 ✔ workflows 1.2.0
## ✔ purrr 1.1.0 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ rlang:::=() masks data.table:::=()
## ✖ purrr::%@%() masks rlang::%@%()
## ✖ dplyr::between() masks data.table::between()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ purrr::flatten() masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke() masks rlang::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ rsample::populate() masks Rcpp::populate()
## ✖ purrr::splice() masks rlang::splice()
## ✖ recipes::step() masks stats::step()
## ✖ purrr::transpose() masks data.table::transpose()
library(ranger)
train_df$will_ship <- factor(train_df$will_ship)
rf_spec <- rand_forest(
trees = 200
) |>
set_engine("ranger", importance = "impurity") |>
set_mode("classification")
rf_fit <- rf_spec |>
fit(
will_ship ~ max_journey_steps_until_end +
n_events +
n_unique_events +
journey_duration_days +
last_event_is_checkout +
n_add_to_cart +
n_begin_checkout +
n_view_cart,
data = train_df
)
rf_fit
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~200, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 200
## Sample size: 127212
## Number of independent variables: 8
## Mtry: 2
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.02117077
preds <- predict(rf_fit, train_df) |>
bind_cols(train_df |> select(will_ship))
accuracy(preds, truth = will_ship, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.983
Previous scores from old assignment :
OOB accuracy: 0.9737917806496242
Training accuracy: 0.9998899474892305
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
vip(rf_fit$fit) +
ggplot2::labs(
title = "Variable Importance",
x = "Importance",
y = NULL
)
key point from Task 3
getwd()
## [1] "/Users/isaiahmireles/Desktop/M148 GP"
oj <- fread("open_journeys1.csv")
train_df$n_place_downpayment <- sapply(train_df$event_names, function(x) sum(x == "place_downpayment"))
train_df$n_place_order_web <- sapply(train_df$event_names, function(x) sum(x == "place_order_web"))
train_df$n_place_order_phone <- sapply(train_df$event_names, function(x) sum(x == "place_order_phone"))
train_df$last_is_promotion_created <- sapply(
train_df$event_names,
function(x) as.integer(x[length(x)] == "promotion_created")
)
train_df$n_promotion_created <- sapply(train_df$event_names, function(x) sum(x == "promotion_created"))
train_df$n_application_web_approved <- sapply(train_df$event_names, function(x) sum(x == "application_web_approved"))
train_df$n_browse_products <- sapply(train_df$event_names, function(x) sum(x == "browse_products"))
train_df$n_application_web_view <- sapply(train_df$event_names, function(x) sum(x == "application_web_view"))
train_df$n_campaign_click <- sapply(train_df$event_names, function(x) sum(x == "campaign_click"))
train_df$n_application_web_submit <- sapply(train_df$event_names, function(x) sum(x == "application_web_submit"))
train_df$n_campaignemail_clicked <- sapply(train_df$event_names, function(x) sum(x == "campaignemail_clicked"))
train_df$last_is_place_order_phone <- sapply(
train_df$event_names,
function(x) as.integer(x[length(x)] == "place_order_phone")
)
train_df$last_is_campaign_click <- sapply(
train_df$event_names,
function(x) as.integer(x[length(x)] == "campaign_click")
)
train_df$last_is_begin_checkout <- sapply(
train_df$event_names,
function(x) as.integer(x[length(x)] == "begin_checkout")
)
library(tidymodels)
library(ranger)
library(ggplot2)
feature_cols <- c(
"max_journey_steps_until_end",
"n_place_downpayment",
"n_place_order_web",
"n_unique_events",
"n_place_order_phone",
"n_begin_checkout",
"journey_duration_days",
"last_is_promotion_created",
"n_add_to_cart",
"n_promotion_created",
"n_view_cart",
"n_events",
"n_application_web_approved",
"n_browse_products",
"n_application_web_view",
"n_campaign_click",
"n_application_web_submit",
"n_campaignemail_clicked",
"last_is_place_order_phone",
"last_is_campaign_click",
"last_is_begin_checkout"
)
train_df$will_ship <- factor(train_df$will_ship)
rf_spec <- rand_forest(
trees = 150,
mtry = floor(sqrt(length(feature_cols))),
min_n = 10
) |>
set_engine("ranger", importance = "impurity", seed = 42) |>
set_mode("classification")
rf_fit_new <- rf_spec |>
fit(
as.formula(paste("will_ship ~", paste(feature_cols, collapse = " + "))),
data = train_df
)
rf_fit_new
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~floor(sqrt(length(feature_cols))), x), num.trees = ~150, min.node.size = min_rows(~10, x), importance = ~"impurity", seed = ~42, num.threads = 1, verbose = FALSE, probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 150
## Sample size: 127212
## Number of independent variables: 21
## Mtry: 4
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.009584753
1 - rf_fit_new$fit$prediction.error
## [1] 0.9904152
train_preds <- predict(rf_fit_new, train_df) |>
bind_cols(train_df |> select(will_ship))
accuracy(
train_preds,
truth = will_ship,
estimate = .pred_class
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.994
importance_df <- data.frame(
feature = names(rf_fit_new$fit$variable.importance),
importance = rf_fit_new$fit$variable.importance
) |>
arrange(importance)
importance_df |>
ggplot(aes(x = reorder(feature, importance), y = importance)) +
geom_col() +
coord_flip() +
labs(
title = "Variable Importance",
x = NULL,
y = "Importance"
)
library(tidymodels)
library(ranger)
library(ggplot2)
set.seed(123)
train_df$will_ship <- factor(train_df$will_ship)
rf_formula <- as.formula(
paste("will_ship ~", paste(feature_cols, collapse = " + "))
)
folds <- vfold_cv(
train_df,
v = 5,
strata = will_ship
)
rf_spec <- rand_forest(
trees = 150,
mtry = floor(sqrt(length(feature_cols))),
min_n = 10
) |>
set_engine("ranger", importance = "impurity", seed = 42) |>
set_mode("classification")
rf_wf <- workflow() |>
add_model(rf_spec) |>
add_formula(rf_formula)
rf_cv <- fit_resamples(
rf_wf,
resamples = folds,
metrics = metric_set(accuracy, roc_auc),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(rf_cv)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.988 5 0.000334 Preprocessor1_Model1
## 2 roc_auc binary 0.998 5 0.0000742 Preprocessor1_Model1
collect_metrics(rf_cv, summarize = FALSE)
## # A tibble: 10 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <chr>
## 1 Fold1 accuracy binary 0.989 Preprocessor1_Model1
## 2 Fold1 roc_auc binary 0.998 Preprocessor1_Model1
## 3 Fold2 accuracy binary 0.987 Preprocessor1_Model1
## 4 Fold2 roc_auc binary 0.998 Preprocessor1_Model1
## 5 Fold3 accuracy binary 0.989 Preprocessor1_Model1
## 6 Fold3 roc_auc binary 0.998 Preprocessor1_Model1
## 7 Fold4 accuracy binary 0.988 Preprocessor1_Model1
## 8 Fold4 roc_auc binary 0.998 Preprocessor1_Model1
## 9 Fold5 accuracy binary 0.989 Preprocessor1_Model1
## 10 Fold5 roc_auc binary 0.998 Preprocessor1_Model1
collect_metrics(rf_cv) |>
filter(.metric == "accuracy")
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.988 5 0.000334 Preprocessor1_Model1
rf_cv <- fit_resamples(
rf_wf,
resamples = folds,
metrics = metric_set(accuracy, roc_auc, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(rf_cv)
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.988 5 0.000334 Preprocessor1_Model1
## 2 roc_auc binary 0.998 5 0.0000742 Preprocessor1_Model1
## 3 sensitivity binary 0.990 5 0.000234 Preprocessor1_Model1
## 4 specificity binary 0.983 5 0.00104 Preprocessor1_Model1
open_df <- journeys |>
filter(incomplete == TRUE)
add_rf_features <- function(dat) {
dat$n_events <- sapply(dat$event_names, length)
dat$n_unique_events <- sapply(dat$event_names, function(x) length(unique(x)))
dat$journey_duration_days <- sapply(dat$event_timestamps, function(x) {
as.numeric(max(x) - min(x), units = "days")
})
dat$n_place_downpayment <- sapply(dat$event_names, function(x) sum(x == "place_downpayment"))
dat$n_place_order_web <- sapply(dat$event_names, function(x) sum(x == "place_order_web"))
dat$n_place_order_phone <- sapply(dat$event_names, function(x) sum(x == "place_order_phone"))
dat$n_begin_checkout <- sapply(dat$event_names, function(x) sum(x == "begin_checkout"))
dat$n_add_to_cart <- sapply(dat$event_names, function(x) sum(x == "add_to_cart"))
dat$n_promotion_created <- sapply(dat$event_names, function(x) sum(x == "promotion_created"))
dat$n_view_cart <- sapply(dat$event_names, function(x) sum(x == "view_cart"))
dat$n_application_web_approved <- sapply(dat$event_names, function(x) sum(x == "application_web_approved"))
dat$n_browse_products <- sapply(dat$event_names, function(x) sum(x == "browse_products"))
dat$n_application_web_view <- sapply(dat$event_names, function(x) sum(x == "application_web_view"))
dat$n_campaign_click <- sapply(dat$event_names, function(x) sum(x == "campaign_click"))
dat$n_application_web_submit <- sapply(dat$event_names, function(x) sum(x == "application_web_submit"))
dat$n_campaignemail_clicked <- sapply(dat$event_names, function(x) sum(x == "campaignemail_clicked"))
dat$last_is_promotion_created <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "promotion_created"))
dat$last_is_place_order_phone <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "place_order_phone"))
dat$last_is_campaign_click <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "campaign_click"))
dat$last_is_begin_checkout <- sapply(dat$event_names, function(x) as.integer(x[length(x)] == "begin_checkout"))
dat
}
open_df <- add_rf_features(open_df)
pred_probs <- predict(rf_fit_new, open_df, type = "prob")
pred_class <- predict(rf_fit_new, open_df)
pred <- open_df |>
select(customer_id, account_id) |>
bind_cols(pred_probs, pred_class) |>
mutate(
id = paste(customer_id, account_id),
ship_probability = .pred_TRUE,
predicted_order_shipped = as.integer(.pred_class == "TRUE")
) |>
select(id, ship_probability, predicted_order_shipped)
success_rate <- mean(pred$predicted_order_shipped)
failure_rate <- 1 - success_rate
success_rate
## [1] 0.01348492
failure_rate
## [1] 0.9865151
oj <- fread("open_journeys1.csv")
oj$date <- as.Date(oj$event_timestamp)
oj$dt_tm <- as.POSIXct(
oj$event_timestamp,
format = "%Y-%m-%dT%H:%M:%SZ",
tz = "UTC"
)
oj <- oj |>
distinct(id, dt_tm, event_name, .keep_all = TRUE)
open_journeys <- oj |>
arrange(id, dt_tm) |>
group_by(id) |>
summarise(
event_names = list(event_name),
event_timestamps = list(dt_tm),
max_journey_steps_until_end = max(journey_steps_until_end),
.groups = "drop"
)
open_journeys <- add_rf_features(open_journeys)
pred_probs <- predict(
rf_fit_new,
open_journeys,
type = "prob"
)
submission <- open_journeys |>
bind_cols(pred_probs) |>
transmute(
id,
order_shipped = .pred_TRUE
)
sample_sub <- fread("open_journeys1_flattened_all0.csv")
nrow(submission)
## [1] 158325
nrow(sample_sub)
## [1] 158325
all(submission$id == sample_sub$id)
## [1] TRUE