Data Overview

library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
auto_dt <- read.csv('C:/Users/Anhuynh/Desktop/Interview & Work Projects/Upstart/Upstart_auto_problem_set_data.csv')

Data Engineering

# convert data types
auto_dt$bucket_timestamp <- strptime(auto_dt$bucket_timestamp,'%Y-%m-%d %H:%M:%S')

auto_dt$rate_check_timestamp <- strptime(auto_dt$rate_check_timestamp,'%Y-%m-%d %H:%M:%S')

auto_dt$funded_loan_timestamp <- strptime(auto_dt$funded_loan_timestamp,'%Y-%m-%d %H:%M:%S')

auto_dt$treatment <- as.factor(auto_dt$treatment)

# calculate time difference of rate check and loan funded
auto_dt_ad <- auto_dt %>%
        mutate(diff_days_ratecheck_fundedloan = difftime( funded_loan_timestamp, rate_check_timestamp, unit = 'days'))

auto_dt_ad$diff_days_ratecheck_fundedloan <- as.numeric(auto_dt_ad$diff_days_ratecheck_fundedloan)

auto_dt_ad$diff_days_ratecheck_fundedloan <- if_else(is.na(auto_dt_ad$diff_days_ratecheck_fundedloan),0,auto_dt_ad$diff_days_ratecheck_fundedloan)

Statistical Test

Data Preparation

library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
treat.index <- createDataPartition(auto_dt_ad$upstart_application_id, p = .5, list = FALSE)

# Treatment data set 
user_treated <- auto_dt_ad[ treat.index,]

# Control data set
user_control <- auto_dt_ad[-treat.index,]

1. Acquisition Metrics:

1.1. Click-Through Rate (CTR)

# Run some analysis of the A/B testing outcomes
# Chi-Squared Test
# We want to see if there is a difference between got_rate percentage to see offers from the users who saw "Goal" 
# question and those who did not.

# Load ggplot package
library(ggplot2)

# Density plot of got_rate percentage by treatment
ggplot(auto_dt_ad, aes(x = got_rate, color = treatment)) +
geom_density(aes(linetype = treatment)) +
labs(title = "Click-Through Rate to See Offers by Treatment Group", x = "Click-Through Rate",
y = 'Density', color = "Treated", linetype = "Treated") +
theme(plot.title = element_text(hjust = 0.5))

# Run the chi-squared test
ct <- chisq.test(user_treated$got_rate, user_control$got_rate)
ct
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  user_treated$got_rate and user_control$got_rate
## X-squared = 0.00084196, df = 1, p-value = 0.9769
ct$expected
##                      user_control$got_rate
## user_treated$got_rate        0        1
##                     0 1236.855 2181.145
##                     1 2069.145 3648.855
ct$observed
##                      user_control$got_rate
## user_treated$got_rate    0    1
##                     0 1238 2180
##                     1 2068 3650

1.2. User Acceptance Rate

# Run some analysis of the A/B testing outcomes
# Chi-Squared Test
# We want to see if there is a difference between rate accepted 
# percentage from the users who saw "Goal" 
# question and those who did not.

# Load ggplot package
library(ggplot2)

# Density plot of User Acceptance Rate by treatment
ggplot(auto_dt_ad, aes(x = rate_accepted, color = treatment)) +
geom_density(aes(linetype = treatment)) +
labs(title = "Percentage of Rate Accepted by Treatment Group", x = "User Acceptance Rate",
y = 'Density', color = "Treated", linetype = "Treated") +
theme(plot.title = element_text(hjust = 0.5))

# Run the chi-squared test
ct2 <- chisq.test(user_treated$rate_accepted, user_control$rate_accepted)
ct2
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  user_treated$rate_accepted and user_control$rate_accepted
## X-squared = 0.19329, df = 1, p-value = 0.6602
ct2$expected
##                           user_control$rate_accepted
## user_treated$rate_accepted        0         1
##                          0 6734.601 1141.3993
##                          1 1077.399  182.6007
ct2$observed
##                           user_control$rate_accepted
## user_treated$rate_accepted    0    1
##                          0 6729 1147
##                          1 1083  177

1.3. Survival Rate

Survival curves are used to examine the effects on the likelihood of remaining in need of a loan application after users check the rate until the loan is funded.

# Library survival
library(survival)
## Warning: package 'survival' was built under R version 4.3.3
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
# Function to calculate survival curves
sur_curve <- function(treated_rd, steps){
# Create output object
survival_mat <- matrix(c(NA), length(steps), 1)
for(i in 1:length(steps)){
# Calculate proportion surviving
survival_mat[i] <-
sum(ifelse(treated_rd >= steps[i], 1, 0))/length(treated_rd)
}
# Return matrix
return(survival_mat)
}

# Round data and calculate steps for survival curve
treated_rd <- round(user_treated$diff_days_ratecheck_fundedloan, 1)
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd <- round(user_control$diff_days_ratecheck_fundedloan, 1)
# Call survival function to plot data
survival_treated <- sur_curve(treated_rd, steps)
survival_control <- sur_curve(control_rd, steps)
# Plot survival curves for treatment and control groups
full_survival <- as.data.frame(cbind(survival_treated, survival_control))
colnames(full_survival) <- c("treated", "control")
full_survival$days <- c(1:3)
ggplot(full_survival) +
geom_line(aes(days, treated, linetype = "1" ), size=0.8) +
geom_line(aes(days, control, linetype = "0"), size=0.5) + 
labs(title = "Probability of Survival by Treatment Group",x= "Days",
y = "Proportion", linetype='Treated') +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Survival analysis, log-rank test
fit <- survdiff(Surv(as.numeric(as.character(auto_dt_ad$diff_days_ratecheck_fundedloan)))
~ auto_dt_ad$treatment)

fit
## Call:
## survdiff(formula = Surv(as.numeric(as.character(auto_dt_ad$diff_days_ratecheck_fundedloan))) ~ 
##     auto_dt_ad$treatment)
## 
##                               N Observed Expected (O-E)^2/E (O-E)^2/V
## auto_dt_ad$treatment=FALSE 9140     9140     9109     0.102      3.67
## auto_dt_ad$treatment=TRUE  9132     9132     9163     0.102      3.67
## 
##  Chisq= 3.7  on 1 degrees of freedom, p= 0.06
# Breakdown in the User types

# User Type 01: users chose “Pay less on my monthly bill”
# Round data and calculate steps for survival curve of User Type 01
treated_rd_1 <- round(user_treated$diff_days_ratecheck_fundedloan[which(user_treated$application_motivation == 'lower_monthly')], 1)
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_1 <- round(user_control$diff_days_ratecheck_fundedloan[which(user_control$application_motivation == 'lower_monthly')], 1)
# Call survival function to plot data
survival_treated_1 <- sur_curve(treated_rd_1, steps)
survival_control_1 <- sur_curve(control_rd_1, steps)
# Plot survival curves for treatment and control groups
full_survival_1 <- as.data.frame(cbind(survival_treated_1, survival_control_1))
colnames(full_survival_1) <- c("treated", "control")
full_survival_1$days <- c(1:3)
ggplot(full_survival_1) +
geom_line(aes(days, treated, linetype = "1" ), size=0.8) +
geom_line(aes(days, control, linetype = "0"), size=0.5) + 
labs(title = "Probability of Survival by Treatment Group", subtitle = "User Type 01: Lower Monthly Bill", x= "Days",
y = "Proportion", linetype='Treated') +
theme(plot.title = element_text(hjust = 0.5))

# User Type 02: users chose “Pay less interest overall”
# Round data and calculate steps for survival curve of User Type 02
treated_rd_2 <- round(user_treated$diff_days_ratecheck_fundedloan[which(user_treated$application_motivation == 'lower_interest')], 1)
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_2 <- round(user_control$diff_days_ratecheck_fundedloan[which(user_control$application_motivation == 'lower_interest')], 1)
# Call survival function to plot data
survival_treated_2 <- sur_curve(treated_rd_2, steps)
survival_control_2 <- sur_curve(control_rd_2, steps)
# Plot survival curves for treatment and control groups
full_survival_2 <- as.data.frame(cbind(survival_treated_2, survival_control_2))
colnames(full_survival_2) <- c("treated", "control")
full_survival_2$days <- c(1:3)
ggplot(full_survival_2) +
geom_line(aes(days, treated, linetype = "1" ), size=0.8) +
geom_line(aes(days, control, linetype = "0"), size=0.5) + 
labs(title = "Probability of Survival by Treatment Group", subtitle = "User Type 02: Lower Interest Overall", x= "Days",
y = "Proportion", linetype='Treated') +
theme(plot.title = element_text(hjust = 0.5))

# User Type 03: Users chose “Not Sure”
# Round data and calculate steps for survival curve of User Type 03
treated_rd_3 <- round(user_treated$diff_days_ratecheck_fundedloan[which(user_treated$application_motivation == 'not_sure')], 1)
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_3 <- round(user_control$diff_days_ratecheck_fundedloan[which(user_control$application_motivation == 'not_sure')], 1)
# Call survival function to plot data
survival_treated_3 <- sur_curve(treated_rd_3, steps)
survival_control_3 <- sur_curve(control_rd_3, steps)
# Plot survival curves for treatment and control groups
full_survival_3 <- as.data.frame(cbind(survival_treated_3, survival_control_3))
colnames(full_survival_3) <- c("treated", "control")
full_survival_3$days <- c(1:3)
ggplot(full_survival_3) +
geom_line(aes(days, treated, linetype = "1" ), size=0.8) +
geom_line(aes(days, control, linetype = "0"), size=0.5) + 
labs(title = "Probability of Survival by Treatment Group", subtitle = "User Type 03: Not Sure", x= "Days",
y = "Proportion", linetype='Treated') +
theme(plot.title = element_text(hjust = 0.5))

1.4. Average Revenue

# Calculate revenue per user 
# Monthly interest rate
auto_dt_ad$r <- auto_dt_ad$existing_apr / 12
# remaining_balance
auto_dt_ad$remaining_balance = (auto_dt_ad$existing_monthly_payment * (1 - (1 + auto_dt_ad$r) ** -auto_dt_ad$existing_months_remaining)) / auto_dt_ad$r

# Calculate total funded payments for the loan
auto_dt_ad$total_funded_payments = auto_dt_ad$funded_loan_monthly_payment * auto_dt_ad$funded_loan_contract_months

# Average Revenue per user
auto_dt_ad$revenue_per_user = auto_dt_ad$total_funded_payments - auto_dt_ad$remaining_balance


# Upstart Total Interest Paid of Short/Long Offer
# Function to calculate Upstart Revenue or Total Interest Paid by users given the offers
calculate_revenue_per_user <- function(loan_amount, apr, contract_months, monthly_payment) {
  # Total amount paid by the user
  total_amount_paid <- monthly_payment * contract_months
  
  # Total interest paid by the user
  total_interest_paid <- total_amount_paid - loan_amount
  
  return(total_interest_paid)
}

auto_dt_ad$short_offer_interest_paid <- calculate_revenue_per_user(auto_dt_ad$upstart_short_offer_loan_amount, auto_dt_ad$upstart_short_offer_apr, auto_dt_ad$upstart_short_offer_contract_months, auto_dt_ad$upstart_short_offer_monthly_payment)

auto_dt_ad$long_offer_interest_paid <- calculate_revenue_per_user(auto_dt_ad$upstart_long_offer_loan_amount, auto_dt_ad$upstart_long_offer_apr, auto_dt_ad$upstart_long_offer_contract_months, auto_dt_ad$upstart_long_offer_monthly_payment)
library(caret)
treat.index <- createDataPartition(auto_dt_ad$upstart_application_id, p = .5, list = FALSE)

# Treatment data set 
user_treated_2 <- auto_dt_ad[ treat.index,]

# Control data set
user_control_2 <- auto_dt_ad[-treat.index,]
# Average Revenue Per User (ARPU)
# Load ggplot package
library(ggplot2)
# Density plot of ARPU By treatment and control
ggplot(auto_dt_ad, aes(x = revenue_per_user, color = treatment)) +
geom_density(aes(linetype = treatment)) +
labs(title = "Average Revenue Per User by Treatment Group",
x = "Average Revenue Per User", y = "Density",
color = "Treated", linetype = "Treated") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 17750 rows containing non-finite values (`stat_density()`).

# Running a t-test
t.test(user_treated_2$revenue_per_user, user_control_2$revenue_per_user)
## 
##  Welch Two Sample t-test
## 
## data:  user_treated_2$revenue_per_user and user_control_2$revenue_per_user
## t = -0.093882, df = 518.38, p-value = 0.9252
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1736.484  1578.088
## sample estimates:
## mean of x mean of y 
##  25202.63  25281.83
# Average Revenue Per User (ARPU) By User Types
# Load ggplot package
library(ggplot2)

# Density plot of ARPU By treatment and control
ggplot(auto_dt_ad, aes(x = revenue_per_user, color = application_motivation)) +
geom_density(aes(linetype = application_motivation)) +
labs(title = "Average Revenue Per User by Treatment Group", subtitle = "Beakdown By User Types",
x = "Average Revenue Per User", y = "Density",
color = "Treated", linetype = "Treated") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 17750 rows containing non-finite values (`stat_density()`).

# Running a t-test
t.test(user_treated_2$revenue_per_user, user_control_2$revenue_per_user)
## 
##  Welch Two Sample t-test
## 
## data:  user_treated_2$revenue_per_user and user_control_2$revenue_per_user
## t = -0.093882, df = 518.38, p-value = 0.9252
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1736.484  1578.088
## sample estimates:
## mean of x mean of y 
##  25202.63  25281.83
# User Type 01: users chose “Pay less on monthly bill”
# Running a t-test
t.test(user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == 'lower_monthly')], user_control_2$revenue_per_user[which(user_treated_2$application_motivation == 'lower_monthly')])
## 
##  Welch Two Sample t-test
## 
## data:  user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == "lower_monthly")] and user_control_2$revenue_per_user[which(user_treated_2$application_motivation == "lower_monthly")]
## t = 1.4357, df = 147.24, p-value = 0.1532
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -883.2075 5575.1441
## sample estimates:
## mean of x mean of y 
##  26957.12  24611.15
# User Type 02: users chose “Pay less interest overall”
# Running a t-test
t.test(user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == 'lower_interest')], user_control_2$revenue_per_user[which(user_treated_2$application_motivation == 'lower_interest')])
## 
##  Welch Two Sample t-test
## 
## data:  user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == "lower_interest")] and user_control_2$revenue_per_user[which(user_treated_2$application_motivation == "lower_interest")]
## t = -0.92521, df = 67.85, p-value = 0.3581
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5814.025  2130.596
## sample estimates:
## mean of x mean of y 
##  22258.89  24100.60
# User Type 03: Users chose “Not Sure”
# Running a t-test
t.test(user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == 'not_sure')], user_control_2$revenue_per_user[which(user_treated_2$application_motivation == 'not_sure')])
## 
##  Welch Two Sample t-test
## 
## data:  user_treated_2$revenue_per_user[which(user_treated_2$application_motivation == "not_sure")] and user_control_2$revenue_per_user[which(user_treated_2$application_motivation == "not_sure")]
## t = -1.5111, df = 30.305, p-value = 0.1411
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13658.010   2038.747
## sample estimates:
## mean of x mean of y 
##  22826.94  28636.58

Correlation Matrix

# data preparation

auto_dt_cor1 <- auto_dt_ad[, c(2,3,4,8,9,10,11,12,13,14,15,27,31,32,33)]

auto_dt_cor2 <- auto_dt_ad[, c(2,3,4,8,9,10,11,16,17,18,19,20,21,22,23,31)]

library(ggcorrplot) ## apply to multiple data types
model.matrix(~0+., data=auto_dt_cor1) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, type="lower", lab=TRUE, lab_size=2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero

library(ggcorrplot) ## apply to multiple data types
model.matrix(~0+., data=auto_dt_cor2) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, type="lower", lab=TRUE, lab_size=2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero