library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw experiment export.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the script runs on other machines.
auto_dt <- read.csv('C:/Users/Anhuynh/Desktop/Interview & Work Projects/Upstart/Upstart_auto_problem_set_data.csv')
# convert data types
# Timestamps are stored as POSIXct: strptime() returns POSIXlt, which is a
# list-based class and is not suited to data-frame columns.
timestamp_format <- "%Y-%m-%d %H:%M:%S"
auto_dt$bucket_timestamp <- as.POSIXct(
  auto_dt$bucket_timestamp,
  format = timestamp_format
)
auto_dt$rate_check_timestamp <- as.POSIXct(
  auto_dt$rate_check_timestamp,
  format = timestamp_format
)
auto_dt$funded_loan_timestamp <- as.POSIXct(
  auto_dt$funded_loan_timestamp,
  format = timestamp_format
)
auto_dt$treatment <- as.factor(auto_dt$treatment)
# calculate time difference of rate check and loan funded
# Rows with a missing timestamp yield NA and are set to 0 days.
# NOTE(review): a value of 0 therefore also covers rows without a funding
# timestamp; downstream analyses should not read it as "funded immediately".
auto_dt_ad <- auto_dt %>%
  mutate(
    diff_days_ratecheck_fundedloan = as.numeric(
      difftime(funded_loan_timestamp, rate_check_timestamp, units = "days")
    ),
    diff_days_ratecheck_fundedloan = coalesce(diff_days_ratecheck_fundedloan, 0)
  )
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Split users by the experiment arm they were actually assigned to. The
# `treatment` factor (levels "FALSE"/"TRUE") records the assignment; a random
# 50/50 partition would put users from both arms into each group and make
# every treated-vs-control comparison meaningless.
treat.index <- which(auto_dt_ad$treatment %in% "TRUE")
control.index <- which(auto_dt_ad$treatment %in% "FALSE")
# Treatment data set
user_treated <- auto_dt_ad[treat.index, ]
# Control data set
user_control <- auto_dt_ad[control.index, ]
# Run some analysis of the A/B testing outcomes
# Chi-Squared Test
# We want to see if the share of users who got a rate (got_rate) differs
# between the users who saw the "Goal" question and those who did not.
# Load ggplot package
library(ggplot2)
# Density plot of got_rate percentage by treatment
# NOTE(review): got_rate takes the values 0/1, so a bar chart of the rate per
# arm may communicate this better than a density.
ggplot(auto_dt_ad, aes(x = got_rate, color = treatment)) +
  geom_density(aes(linetype = treatment)) +
  labs(
    title = "Click-Through Rate to See Offers by Treatment Group",
    x = "Click-Through Rate",
    y = "Density",
    color = "Treated",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Run the chi-squared test on the treatment x got_rate contingency table.
# chisq.test(x, y) cross-tabulates two vectors element by element, so the
# arm label and the outcome of the SAME user must be tabulated together.
got_rate_by_arm <- table(
  treatment = auto_dt_ad$treatment,
  got_rate = auto_dt_ad$got_rate
)
ct <- chisq.test(got_rate_by_arm)
ct
ct$expected
ct$observed
# NOTE: console output is not recorded here; re-run to regenerate it for the
# treatment x got_rate table.
# Run some analysis of the A/B testing outcomes
# Chi-Squared Test
# We want to see if there is a difference between rate accepted
# percentage from the users who saw "Goal"
# question and those who did not.
# Load ggplot package
library(ggplot2)
# Density plot of User Acceptance Rate by treatment
ggplot(auto_dt_ad, aes(x = rate_accepted, color = treatment)) +
  geom_density(aes(linetype = treatment)) +
  labs(
    title = "Percentage of Rate Accepted by Treatment Group",
    x = "User Acceptance Rate",
    y = "Density",
    color = "Treated",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Run the chi-squared test on the treatment x rate_accepted contingency table
rate_accepted_by_arm <- table(
  treatment = auto_dt_ad$treatment,
  rate_accepted = auto_dt_ad$rate_accepted
)
ct2 <- chisq.test(rate_accepted_by_arm)
ct2
ct2$expected
ct2$observed
# NOTE: console output is not recorded here; re-run to regenerate it for the
# treatment x rate_accepted table.
# Survival curves are used to examine how the treatment affects the likelihood that a user's loan is still unfunded a given number of days after the rate check.
# Library survival
library(survival)
## Warning: package 'survival' was built under R version 4.3.3
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
# Function to calculate survival curves
#
# For every value in `steps`, computes the share of observations in
# `treated_rd` that are greater than or equal to that step.
#
# treated_rd: numeric vector of durations (one entry per user).
# steps:      numeric vector of thresholds at which to evaluate the curve.
#
# Returns a length(steps) x 1 numeric matrix of proportions. An empty `steps`
# gives a 0 x 1 matrix; an empty `treated_rd` gives NaN (0 / 0) per step; NA
# durations propagate to NA proportions.
sur_curve <- function(treated_rd, steps) {
  n_obs <- length(treated_rd)
  # vapply() iterates over the steps themselves, so an empty `steps` yields an
  # empty result instead of looping over the indices 1 and 0.
  surviving <- vapply(
    steps,
    function(step) sum(treated_rd >= step) / n_obs,
    numeric(1)
  )
  matrix(surviving, nrow = length(steps), ncol = 1)
}
# Round data and calculate steps for survival curve
treated_rd <- round(user_treated$diff_days_ratecheck_fundedloan, 1)
control_rd <- round(user_control$diff_days_ratecheck_fundedloan, 1)
# The 10-day grid spans the longest duration in EITHER group so that neither
# curve is cut short.
steps <- seq(from = 0, to = max(treated_rd, control_rd), by = 10)
# Call survival function to plot data
survival_treated <- sur_curve(treated_rd, steps)
survival_control <- sur_curve(control_rd, steps)
# Plot survival curves for treatment and control groups
full_survival <- as.data.frame(cbind(survival_treated, survival_control))
colnames(full_survival) <- c("treated", "control")
# Use the actual step values (in days) on the x axis; a fixed 1:3 only fits
# when exactly three steps exist and mislabels them as days 1, 2 and 3.
full_survival$days <- steps
# `linewidth` replaces the `size` aesthetic for lines (deprecated in
# ggplot2 3.4.0).
ggplot(full_survival) +
  geom_line(aes(days, treated, linetype = "1"), linewidth = 0.8) +
  geom_line(aes(days, control, linetype = "0"), linewidth = 0.5) +
  labs(
    title = "Probability of Survival by Treatment Group",
    x = "Days",
    y = "Proportion",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Survival analysis, log-rank test
# Surv() needs an event indicator: without one, every row is treated as an
# observed event. Only users with both a rate-check and a funding timestamp
# actually experienced the event (loan funded); all others are censored.
# NOTE(review): censored rows carry a duration of 0 because missing durations
# are zero-filled upstream; if an observation-window end date is available,
# censoring at that date would be more informative.
loan_funded <- !is.na(auto_dt_ad$funded_loan_timestamp) &
  !is.na(auto_dt_ad$rate_check_timestamp)
fit <- survdiff(
  Surv(diff_days_ratecheck_fundedloan, loan_funded) ~ treatment,
  data = auto_dt_ad
)
fit
# NOTE: console output is not recorded here; re-run to regenerate it with the
# event indicator in place.
# Breakdown in the User types
# User Type 01: users chose “Pay less on my monthly bill”
# Round data and calculate steps for survival curve of User Type 01
treated_rd_1 <- round(
  user_treated$diff_days_ratecheck_fundedloan[
    which(user_treated$application_motivation == "lower_monthly")
  ],
  1
)
# The grid is built from the overall treated durations (treated_rd), so all
# user-type plots share the same x axis.
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_1 <- round(
  user_control$diff_days_ratecheck_fundedloan[
    which(user_control$application_motivation == "lower_monthly")
  ],
  1
)
# Call survival function to plot data
survival_treated_1 <- sur_curve(treated_rd_1, steps)
survival_control_1 <- sur_curve(control_rd_1, steps)
# Plot survival curves for treatment and control groups
full_survival_1 <- as.data.frame(cbind(survival_treated_1, survival_control_1))
colnames(full_survival_1) <- c("treated", "control")
# Use the actual step values (in days) rather than a fixed 1:3, which only
# fits when exactly three steps exist and mislabels the axis.
full_survival_1$days <- steps
# `linewidth` replaces the deprecated `size` aesthetic for lines.
ggplot(full_survival_1) +
  geom_line(aes(days, treated, linetype = "1"), linewidth = 0.8) +
  geom_line(aes(days, control, linetype = "0"), linewidth = 0.5) +
  labs(
    title = "Probability of Survival by Treatment Group",
    subtitle = "User Type 01: Lower Monthly Bill",
    x = "Days",
    y = "Proportion",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# User Type 02: users chose “Pay less interest overall”
# Round data and calculate steps for survival curve of User Type 02
treated_rd_2 <- round(
  user_treated$diff_days_ratecheck_fundedloan[
    which(user_treated$application_motivation == "lower_interest")
  ],
  1
)
# The grid is built from the overall treated durations (treated_rd), so all
# user-type plots share the same x axis.
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_2 <- round(
  user_control$diff_days_ratecheck_fundedloan[
    which(user_control$application_motivation == "lower_interest")
  ],
  1
)
# Call survival function to plot data
survival_treated_2 <- sur_curve(treated_rd_2, steps)
survival_control_2 <- sur_curve(control_rd_2, steps)
# Plot survival curves for treatment and control groups
full_survival_2 <- as.data.frame(cbind(survival_treated_2, survival_control_2))
colnames(full_survival_2) <- c("treated", "control")
# Use the actual step values (in days) rather than a fixed 1:3, which only
# fits when exactly three steps exist and mislabels the axis.
full_survival_2$days <- steps
# `linewidth` replaces the deprecated `size` aesthetic for lines.
ggplot(full_survival_2) +
  geom_line(aes(days, treated, linetype = "1"), linewidth = 0.8) +
  geom_line(aes(days, control, linetype = "0"), linewidth = 0.5) +
  labs(
    title = "Probability of Survival by Treatment Group",
    subtitle = "User Type 02: Lower Interest Overall",
    x = "Days",
    y = "Proportion",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# User Type 03: Users chose “Not Sure”
# Round data and calculate steps for survival curve of User Type 03
treated_rd_3 <- round(
  user_treated$diff_days_ratecheck_fundedloan[
    which(user_treated$application_motivation == "not_sure")
  ],
  1
)
# The grid is built from the overall treated durations (treated_rd), so all
# user-type plots share the same x axis.
steps <- seq(from = 0, to = max(treated_rd), by = 10)
control_rd_3 <- round(
  user_control$diff_days_ratecheck_fundedloan[
    which(user_control$application_motivation == "not_sure")
  ],
  1
)
# Call survival function to plot data
survival_treated_3 <- sur_curve(treated_rd_3, steps)
survival_control_3 <- sur_curve(control_rd_3, steps)
# Plot survival curves for treatment and control groups
full_survival_3 <- as.data.frame(cbind(survival_treated_3, survival_control_3))
colnames(full_survival_3) <- c("treated", "control")
# Use the actual step values (in days) rather than a fixed 1:3, which only
# fits when exactly three steps exist and mislabels the axis.
full_survival_3$days <- steps
# `linewidth` replaces the deprecated `size` aesthetic for lines.
ggplot(full_survival_3) +
  geom_line(aes(days, treated, linetype = "1"), linewidth = 0.8) +
  geom_line(aes(days, control, linetype = "0"), linewidth = 0.5) +
  labs(
    title = "Probability of Survival by Treatment Group",
    subtitle = "User Type 03: Not Sure",
    x = "Days",
    y = "Proportion",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Calculate revenue per user
# Monthly interest rate
# NOTE(review): assumes existing_apr is stored as a decimal fraction
# (e.g. 0.05), not a percentage - confirm against the data dictionary.
auto_dt_ad$r <- auto_dt_ad$existing_apr / 12
# remaining_balance: present value of the remaining monthly payments,
#   payment * (1 - (1 + r)^-n) / r
# When r is exactly 0 that expression is 0 / 0 (NaN); the value in that case
# is simply payment * n, so the factor falls back to n.
annuity_factor <- ifelse(
  auto_dt_ad$r == 0,
  auto_dt_ad$existing_months_remaining,
  (1 - (1 + auto_dt_ad$r)^(-auto_dt_ad$existing_months_remaining)) /
    auto_dt_ad$r
)
auto_dt_ad$remaining_balance <- auto_dt_ad$existing_monthly_payment *
  annuity_factor
# Calculate total funded payments for the loan
auto_dt_ad$total_funded_payments <- auto_dt_ad$funded_loan_monthly_payment *
  auto_dt_ad$funded_loan_contract_months
# Average Revenue per user
auto_dt_ad$revenue_per_user <- auto_dt_ad$total_funded_payments -
  auto_dt_ad$remaining_balance
# Upstart Total Interest Paid of Short/Long Offer
# Total interest a user pays over the life of an offer: the sum of all monthly
# payments minus the amount borrowed. Works element-wise on vectors.
#
# loan_amount:     principal of the offer.
# apr:             part of the signature but not used in the calculation.
# contract_months: number of monthly payments.
# monthly_payment: amount of each monthly payment.
#
# Returns contract_months * monthly_payment - loan_amount.
calculate_revenue_per_user <- function(loan_amount, apr, contract_months, monthly_payment) {
  lifetime_payments <- contract_months * monthly_payment
  lifetime_payments - loan_amount
}
# Total interest paid under the short offer
auto_dt_ad$short_offer_interest_paid <- calculate_revenue_per_user(
  loan_amount = auto_dt_ad$upstart_short_offer_loan_amount,
  apr = auto_dt_ad$upstart_short_offer_apr,
  contract_months = auto_dt_ad$upstart_short_offer_contract_months,
  monthly_payment = auto_dt_ad$upstart_short_offer_monthly_payment
)
# Total interest paid under the long offer
auto_dt_ad$long_offer_interest_paid <- calculate_revenue_per_user(
  loan_amount = auto_dt_ad$upstart_long_offer_loan_amount,
  apr = auto_dt_ad$upstart_long_offer_apr,
  contract_months = auto_dt_ad$upstart_long_offer_contract_months,
  monthly_payment = auto_dt_ad$upstart_long_offer_monthly_payment
)
library(caret)
# Split users by the experiment arm they were actually assigned to. The
# `treatment` factor (levels "FALSE"/"TRUE") records the assignment; a random
# 50/50 partition would put users from both arms into each group.
treat.index <- which(auto_dt_ad$treatment %in% "TRUE")
control.index <- which(auto_dt_ad$treatment %in% "FALSE")
# Treatment data set
user_treated_2 <- auto_dt_ad[treat.index, ]
# Control data set
user_control_2 <- auto_dt_ad[control.index, ]
# Average Revenue Per User (ARPU)
# Load ggplot package
library(ggplot2)
# Density plot of ARPU By treatment and control
ggplot(auto_dt_ad, aes(x = revenue_per_user, color = treatment)) +
  geom_density(aes(linetype = treatment)) +
  labs(
    title = "Average Revenue Per User by Treatment Group",
    x = "Average Revenue Per User",
    y = "Density",
    color = "Treated",
    linetype = "Treated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 17750 rows containing non-finite values (`stat_density()`).
# Running a t-test
# The formula interface groups revenue by each user's recorded `treatment`
# arm, so the Welch test compares the real treatment and control groups.
t.test(revenue_per_user ~ treatment, data = auto_dt_ad)
# NOTE: console output is not recorded here; re-run to regenerate it for the
# comparison by `treatment`.
# Average Revenue Per User (ARPU) By User Types
# Load ggplot package
library(ggplot2)
# Density plot of ARPU by user type (application_motivation)
# The legend is labelled "User Type" because colour and linetype are mapped to
# application_motivation, not to the treatment flag.
ggplot(auto_dt_ad, aes(x = revenue_per_user, color = application_motivation)) +
  geom_density(aes(linetype = application_motivation)) +
  labs(
    title = "Average Revenue Per User",
    subtitle = "Breakdown By User Types",
    x = "Average Revenue Per User",
    y = "Density",
    color = "User Type",
    linetype = "User Type"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 17750 rows containing non-finite values (`stat_density()`).
# Running a t-test
# Overall treatment-vs-control comparison across all user types; the formula
# interface groups revenue by each user's recorded `treatment` arm.
t.test(revenue_per_user ~ treatment, data = auto_dt_ad)
# NOTE: console output is not recorded here; re-run to regenerate it for the
# comparison by `treatment`.
# Welch t-test of revenue per user between the treated and control data sets,
# restricted to one application motivation.
#
# motivation: value of application_motivation to select (e.g. "not_sure").
#
# Each group is filtered with ITS OWN application_motivation column; indexing
# the control revenues with the treated group's row positions would pick
# unrelated control users.
# Returns the t.test() result, or invisible NULL (with a message) when either
# group has fewer than two non-missing revenues, which t.test() cannot handle.
compare_arpu_by_motivation <- function(motivation) {
  treated_revenue <- user_treated_2$revenue_per_user[
    which(user_treated_2$application_motivation == motivation)
  ]
  control_revenue <- user_control_2$revenue_per_user[
    which(user_control_2$application_motivation == motivation)
  ]
  n_treated <- sum(!is.na(treated_revenue))
  n_control <- sum(!is.na(control_revenue))
  if (n_treated < 2 || n_control < 2) {
    message(
      "Skipping t-test for '", motivation, "': ",
      n_treated, " treated and ", n_control,
      " control users with a revenue value."
    )
    return(invisible(NULL))
  }
  t.test(treated_revenue, control_revenue)
}
# User Type 01: users chose “Pay less on monthly bill”
# Running a t-test
compare_arpu_by_motivation("lower_monthly")
# User Type 02: users chose “Pay less interest overall”
# Running a t-test
compare_arpu_by_motivation("lower_interest")
# User Type 03: Users chose “Not Sure”
# Running a t-test
compare_arpu_by_motivation("not_sure")
# NOTE: console output is not recorded here; re-run to regenerate it with each
# group filtered by its own application_motivation.
# data preparation
# Two column subsets of auto_dt_ad, selected by position.
# NOTE(review): positional indices depend on the exact column order of
# auto_dt_ad; selecting by name would be more robust - confirm the intended
# columns.
cor_columns_1 <- c(2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 27, 31, 32, 33)
cor_columns_2 <- c(2, 3, 4, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23, 31)
auto_dt_cor1 <- auto_dt_ad[, cor_columns_1]
auto_dt_cor2 <- auto_dt_ad[, cor_columns_2]
library(ggcorrplot) ## apply to multiple data types
# Expand the first subset into a numeric design matrix (no intercept),
# correlate the columns pairwise and draw the lower triangle.
design_1 <- model.matrix(~0+., data = auto_dt_cor1)
cor_1 <- cor(design_1, use = "pairwise.complete.obs")
ggcorrplot(cor_1, show.diag = FALSE, type = "lower", lab = TRUE, lab_size = 2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero
library(ggcorrplot) ## apply to multiple data types
# Same procedure for the second subset.
design_2 <- model.matrix(~0+., data = auto_dt_cor2)
cor_2 <- cor(design_2, use = "pairwise.complete.obs")
ggcorrplot(cor_2, show.diag = FALSE, type = "lower", lab = TRUE, lab_size = 2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero