Welcome to the Carvana Analytics experiment evaluation assignment!
This project includes fictional data from an outbound call initiative in which Carvana calls users in an attempt to increase sales. In this exercise you will be asked to describe how the campaign has been performing, explore how you might improve it, and evaluate a model provided by another member of the team.
Note that individual users may have more than one call or sale attributed to them.
#read in data
setwd('C:/Users/email/Desktop/predictive modeling analyst test')
users<-read.csv('CustomerHistory.csv')
calls<-read.csv('OBCallHistory.csv')
sales<-read.csv('SaleHistory.csv')
scores<-read.csv('ModelScores.csv')
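Because individual users may have more than one call or sale attributed to them, it is worth confirming the grain of each table before joining. A minimal sketch, assuming the key columns used in the joins below:
#sketch: count records per key to confirm the one-to-many structure
suppressPackageStartupMessages(library(dplyr))
calls %>% count(PhoneNumber_hashed, sort=TRUE) %>% head()
sales %>% count(userid, sort=TRUE) %>% head()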
1. Does contact rate vary across time of day and/or day of week?
#fix dates
suppressPackageStartupMessages(library(lubridate))
calls$OBCallDateTime<-ymd_hms(calls$OBCallDateTime)
suppressPackageStartupMessages(library(dplyr))
bytime<-calls %>%
group_by(day=wday(OBCallDateTime,label=TRUE),
time=factor(cut(hour(OBCallDateTime),breaks=4),
labels=c('Early Morning','Late Morning','Early Afternoon','Late Afternoon'))) %>%
summarise(calls=n(),yes_contact=sum(Contacted)) %>%
mutate(contact_rate=yes_contact/calls)
#one id per day-time cell (7 days x 4 time buckets = 28)
bytime$id=1:28
#plot contact rate by day and time
library(ggplot2)
overall_rate<-sum(calls$Contacted)/nrow(calls) #overall contact rate for the reference line
ggplot(data=bytime, aes(x=reorder(paste(day,time),-id),y=contact_rate))+
  geom_bar(stat='identity',fill="#9ecae1")+
  coord_flip()+
  geom_hline(yintercept=overall_rate,colour="#3182bd")+
  theme_minimal()+
  ylab('Contact Rate')+xlab('Day and Time')
The average contact rate is around 78.5 percent (shown as the vertical line in the plot above). The contact rate by day of the week and time of day is fairly uniform overall. Thursdays and Saturdays appear somewhat better for reaching customers than other days, and late mornings (from 8a to noon) on Sunday and early afternoons (from noon to 3p) on Saturday produce the highest contact rates. Nevertheless, other variables likely explain variability in contact rates above and beyond the day and time.
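As a rough check that this variation is more than noise, a chi-square test can be run over the 28 day-time cells; a minimal sketch using the bytime summary above:
#sketch: contacted vs. not-contacted counts per day-time cell as a 28 x 2 table
chisq.test(cbind(bytime$yes_contact, bytime$calls - bytime$yes_contact))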
2. Does contact rate vary by the customer’s age?
#merge users and calls
suppressPackageStartupMessages(suppressWarnings(library(sqldf)))
getage<-sqldf("select t1.userid
,t1.AgeCategory
,t1.PhoneNumber_hashed
,t2.OBCallDateTime
,t2.Contacted
,t2.advocate_id
,t2.CallTimeSec
from users t1
join calls t2 on (t1.PhoneNumber_hashed=t2.PhoneNumber_hashed)
", stringsAsFactors = FALSE)
#calculate contact rate by age
age<-getage %>%
group_by(AgeCategory) %>%
summarise(calls=n(),yes_contact=sum(Contacted)) %>%
mutate(contact_rate=yes_contact/calls) %>%
select(AgeCategory,contact_rate) %>%
rename(`Contact Rate`=contact_rate)
#plot contact rate by age category
suppressPackageStartupMessages(suppressWarnings(library(googleVis)))
op <- options(gvis.plot.tag='chart')
plot(gvisColumnChart(age,options=list(min=0,title="Contact Rate by Age", width=600,height=400,hAxis="{title:'Age Category'}", legend='none',series="[{color:'#9ecae1'}]")))
Contact rate does not vary by the customer’s age.
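To formalize this, a chi-square test of contact outcomes across age categories can be sketched with the getage frame built above:
#sketch: test for an association between age category and contact outcome
chisq.test(table(getage$AgeCategory, getage$Contacted))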
3. Does the sales conversion rate differ by Income (group this into $5000 ranges)? Why might this be the case?
#merge sales and users
suppressPackageStartupMessages(suppressWarnings(library(sqldf)))
getinc<-sqldf("select t1.userid
,t1.Income
,t2.SaleDateTime
,t2.StickerPrice
from users t1
left join sales t2 on (t1.userid=t2.userid)
", stringsAsFactors = FALSE)
#create sale indicator
getinc$Sale<-ifelse(is.na(getinc$SaleDateTime),0,1)
#create income group
suppressPackageStartupMessages(suppressWarnings(library(Hmisc)))
getinc$Income_Group<-cut2(getinc$Income,seq(-10000,120000,5000))
getinc$Income_Group<-recode(getinc$Income_Group,"[-10000, -5000)"="Less than $5K",
"[ -5000, 0)"="Less than $5K",
"[ 0, 5000)"="Less than $5K",
"[ 5000, 10000)"="$5K-<$10K)",
"[ 10000, 15000)"="$10K-<$15K",
"[ 15000, 20000)"="$15K-<$20K",
"[ 20000, 25000)"="$20K-<$25K",
"[ 25000, 30000)"="$25K-<$30K",
"[ 30000, 35000)"="$30K-<$35K",
"[ 35000, 40000)"="$35K-<$40K",
"[ 40000, 45000)"="$40K-<$45K",
"[ 45000, 50000)"="$45K-<$50K",
"[ 50000, 55000)"="$50K-<$55K",
"[ 55000, 60000)"="$55K-<$60K",
"[ 60000, 65000)"="$60K-<$65K",
"[ 65000, 70000)"="$65K-<$70K",
"[ 70000, 75000)"="$70K-<$75K",
"[ 75000, 80000)"="$75K-<$80K",
"[ 80000, 85000)"="$80K-<$85K",
"[ 85000, 90000)"="$85K-<$90K",
"[ 90000, 95000)"="$90K-<$95K",
"[ 95000,100000)"="$95K or more",
"[100000,105000)"="$95K or more",
"[105000,110000)"="$95K or more",
"[110000,115000)"="$95K or more",
"[115000,120000]"="$95K or more")
income<-getinc[,c("Income_Group","Sale")]
prop.table(table(income$Income_Group,income$Sale),2)
0 1
Less than $5K 0.0018750000 0.0005347594
$5K-<$10K     0.0040625000 0.0019607843
$10K-<$15K 0.0059375000 0.0037433155
$15K-<$20K 0.0132812500 0.0121212121
$20K-<$25K 0.0273437500 0.0226381462
$25K-<$30K 0.0435937500 0.0385026738
$30K-<$35K 0.0576562500 0.0593582888
$35K-<$40K 0.0850000000 0.0825311943
$40K-<$45K 0.0957812500 0.1098039216
$45K-<$50K 0.1195312500 0.1229946524
$50K-<$55K 0.1264062500 0.1319073084
$55K-<$60K 0.1181250000 0.1233511586
$60K-<$65K 0.0970312500 0.1065953654
$65K-<$70K 0.0798437500 0.0764705882
$70K-<$75K 0.0573437500 0.0509803922
$75K-<$80K 0.0325000000 0.0278074866
$80K-<$85K 0.0176562500 0.0135472371
$85K-<$90K 0.0100000000 0.0078431373
$90K-<$95K 0.0043750000 0.0037433155
$95K or more 0.0026562500 0.0035650624
chi<-chisq.test(table(income$Income_Group,income$Sale))
test<-round(chi$statistic,2)
df2<-chi$parameter
p<-round(chi$p.value,2)
chi$stdres
0 1
Less than $5K 2.0748177 -2.0748177
$5K-<$10K        2.0735501 -2.0735501
$10K-<$15K 1.7158840 -1.7158840
$15K-<$20K 0.5655643 -0.5655643
$20K-<$25K 1.6432871 -1.6432871
$25K-<$30K 1.4002899 -1.4002899
$30K-<$35K -0.3966897 0.3966897
$35K-<$40K 0.4870343 -0.4870343
$40K-<$45K -2.5296953 2.5296953
$45K-<$50K -0.5803437 0.5803437
$50K-<$55K -0.8973810 0.8973810
$55K-<$60K -0.8775425 0.8775425
$60K-<$65K -1.7316297 1.7316297
$65K-<$70K 0.6866610 -0.6866610
$70K-<$75K 1.5344080 -1.5344080
$75K-<$80K 1.4966125 -1.4966125
$80K-<$85K 1.8051879 -1.8051879
$85K-<$90K 1.2492334 -1.2492334
$90K-<$95K 0.5418279 -0.5418279
$95K or more -0.8966332 0.8966332
There is a relationship between Income Group and Sales \((\chi^{2} = 37.5,\ df = 19,\ p = 0.01)\). Fewer sales are made to those with less than $10,000 of income. This makes sense, as cars are a rather expensive purchase: without added disposable income, lower-income users likely have fewer opportunities to make large purchases than higher-income users.
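Note that the proportions above are column proportions (the income mix within sales and non-sales). The conversion rate within each income group can also be read off directly from row proportions; a minimal sketch using the income frame above:
#sketch: within-group conversion rate (row proportions, sale column)
round(prop.table(table(income$Income_Group, income$Sale), 1)[,2], 4)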
4. How does conversion rate vary by CreditScore2 (in 50-point buckets) with regard to contact status (contacted, attempted, not attempted)?
getcredit<-sqldf("select t1.userid
,t1.CreditScore2
,t2.SaleDateTime
,t2.StickerPrice
,t3.Contacted
from users t1
left join calls t3 on (t1.PhoneNumber_hashed=t3.PhoneNumber_hashed)
left join sales t2 on (t1.userid=t2.userid)
", stringsAsFactors = FALSE)
#create sale indicator
getcredit$Sale<-ifelse(is.na(getcredit$SaleDateTime),0,1)
#create contact status
getcredit$contact_status<-recode(getcredit$Contacted, "0"="Attempted", "1"="Contacted")
getcredit$contact_status<-ifelse(is.na(getcredit$Contacted),"Not Attempted",getcredit$contact_status)
#create credit score group
getcredit$Credit_Group<-cut2(getcredit$CreditScore2,seq(400,900,50))
credit<-getcredit %>%
group_by(Credit_Group, contact_status) %>%
summarise(users=n(),sales=sum(Sale)) %>%
mutate(sales_rate=sales/users) %>%
select(Credit_Group, contact_status,sales_rate)
ggplot(data=credit, aes(x=Credit_Group, y=sales_rate, group=contact_status, col=contact_status))+
  geom_line() + geom_point()+
  xlab('Credit Score') + ylab('Sales Conversion Rate') +
  theme_minimal() +
  geom_hline(yintercept=0.3788937,linetype="dashed") +
  scale_color_manual(values=c('#e41a1c','#377eb8','#4daf4a'),name="Contact Status") +
  annotate("text", x = 7.5, y = 0.35, label = "Average Sales Conversion Rate (0.38)",size=2) +
  theme(legend.position="bottom")
On average, contacted customers convert at a higher rate than those not contacted. In fact, for low credit score customers (<650), being contacted by a sales associate has a large impact on sales conversion, while there is virtually no difference between attempted contact and no attempt across credit scores. For higher credit scores (>650), the sales conversion rate is higher for non-contacted users than for contacted users; these users can likely secure financing on their own, so a sale is not contingent on an associate's assistance.
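One way to formalize the crossover visible in the plot is a logistic regression with a credit-by-status interaction; a sketch using the getcredit frame above (treating each joined row as an observation, as the plot does):
#sketch: does the effect of contact status vary with credit score?
fit<-glm(Sale ~ CreditScore2*contact_status, data=getcredit, family=binomial)
summary(fit)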
5. Has the sales conversion rate for contacted customers changed over time?
#merge users, calls, and sales
gettime<-sqldf("select t1.userid
,(CASE WHEN t2.SaleDateTime>0 THEN 1 ELSE 0 END) as Sale
,t3.OBCallDateTime
from users t1
left join calls t3 on (t1.PhoneNumber_hashed=t3.PhoneNumber_hashed)
left join sales t2 on (t1.userid=t2.userid)
where t3.Contacted=1", stringsAsFactors = FALSE)
#calculate sales by time
time<-gettime %>%
  group_by(wk=isoweek(OBCallDateTime)) %>%
  summarise(sales_rate=mean(Sale),
            startmo=month(min(date(OBCallDateTime)),label=TRUE),
            startday=day(min(date(OBCallDateTime))),
            endmo=month(max(date(OBCallDateTime)),label=TRUE),
            endday=day(max(date(OBCallDateTime)))) %>%
  mutate(week=paste0(startmo," ",startday," - ",endmo," ",endday)) %>%
  rename(`Sales Conversion Rate`=sales_rate) %>%
  select(week, `Sales Conversion Rate`)
#plot it
op <- options(gvis.plot.tag='chart')
Line <- gvisLineChart(time,options=list(min=0,title="Sales Conversion Rate over Time", legend="none", width=950, height=450,series="[{color:'#377eb8'}]",curveType="function"))
plot(Line)
The sales conversion rate has remained remarkably stable across the first six months of 2018. The tail weeks show the most deviation, but as partial weeks they are ignored for this discussion. There is a slight increase in sales conversion rate from January 15, 2018 to June 3, 2018; however, the trend is not statistically significant \((t=0.496, p=0.626)\).
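The quoted t-statistic presumably comes from a trend test of this form; a minimal sketch regressing weekly conversion rate on week index using the time frame above:
#sketch: test for a linear trend in weekly conversion rate
trend<-lm(`Sales Conversion Rate` ~ seq_along(week), data=time)
summary(trend)$coefficients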
6. Design a simple, rule-based strategy for calling users and evaluate it versus the existing call data. Why did you choose this strategy? Do you think it will improve sales? If so, by how much? What proportion of users should we attempt to call?
#merge users and sales
gettree<-sqldf("select t1.*
,(CASE WHEN t2.SaleDateTime>0 THEN 1 ELSE 0 END) as Sale
from users t1
left join sales t2 on (t1.userid=t2.userid)
left join calls t3 on (t1.PhoneNumber_hashed=t3.PhoneNumber_hashed)
where t3.Contacted=1", stringsAsFactors = FALSE)
gettree$PhoneNumber_hashed<-NULL
#convert indicator columns to factors in one pass
ind_cols<-c("InboundCall","IsDealSeeker","IsDirtLover","IsDreamer","Sale",
            "IsGreen","IsResearcher","TradeInValueGenerated")
gettree[ind_cols]<-lapply(gettree[ind_cols],as.factor)
gettree$CalledUs<-as.factor(ifelse(gettree$InboundCallCount>0,1,0))
gettree$InboundCallCount<-NULL
gettree$accountcreationdatetime<-ymd_hms(gettree$accountcreationdatetime)
#fit decision tree
library(caret)
#set up cross validation
train_control <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions = TRUE)
# train the model
#LDA
lda <- train(as.factor(Sale)~.-userid, data=gettree, trControl=train_control, method="lda",preProcess=c("center", "scale"))
lda$finalModel
Call:
lda(x, grouping = y)
Prior probabilities of groups:
0 1
0.4480726 0.5519274
Group means:
InboundCall1 IsDealSeeker1 IsDirtLover1 IsDreamer1 IsGreen1
0 0.01576389 -1.106241e-04 -0.009027541 0.008003386 -0.01306309
1 -0.01279764 8.980829e-05 0.007328852 -0.006497410 0.01060505
IsResearcher1 TradeInValueGenerated1 TotalInboundCallLength
0 0.001383299 0.0012100819 -0.01995055
1 -0.001123007 -0.0009823839 0.01619650
BureauIncome CreditScore1 CreditScore2 FraudScore Income
0 -0.0295471 0.03102411 0.03802796 -0.03349358 -0.02671323
1 0.0239873 -0.02518638 -0.03087233 0.02719118 0.02168667
MedianVehicleFuelEcon MedianVehiclePrice TradeInValueAmount
0 -0.01312042 -0.009629036 -0.002861395
1 0.01065158 0.007817165 0.002322974
LeadSourceThirdPartyListing LeadSourceWebsite AgeCategory35-45
0 -0.002723548 0.006806087 -0.11657776
1 0.002211065 -0.005525404 0.09464163
AgeCategory45-55 AgeCategory55-65 AgeCategory65+
0 0.03506274 0.01638407 0.01676290
1 -0.02846508 -0.01330112 -0.01360867
accountcreationdatetime MedianVehicleMileage CalledUs1
0 -0.002319858 -0.03763451 -0.03195613
1 0.001883336 0.03055293 0.02594303
Coefficients of linear discriminants:
LD1
InboundCall1 -0.099781239
IsDealSeeker1 -0.003760009
IsDirtLover1 0.081212494
IsDreamer1 -0.038152309
IsGreen1 0.082223896
IsResearcher1 -0.009253282
TradeInValueGenerated1 -0.014137828
TotalInboundCallLength 0.134235521
BureauIncome 0.130112943
CreditScore1 -0.005708944
CreditScore2 -0.252510478
FraudScore 0.219332686
Income 0.117270562
MedianVehicleFuelEcon 0.095124672
MedianVehiclePrice 0.038304264
TradeInValueAmount 0.038198692
LeadSourceThirdPartyListing -0.149622711
LeadSourceWebsite -0.168969967
AgeCategory35-45 0.861163941
AgeCategory45-55 0.068342178
AgeCategory55-65 0.101135847
AgeCategory65+ 0.050153290
accountcreationdatetime 0.003483407
MedianVehicleMileage 0.251031562
CalledUs1 0.215893308
#QDA
qda <- train(as.factor(Sale)~.-userid, data=gettree, trControl=train_control, method="qda",preProcess=c("center", "scale"))
#qda$finalModel
#Decision tree
tree <- train(as.factor(Sale)~.-userid, data=gettree, trControl=train_control, method="rpart",preProcess=c("center", "scale"))
#plot the tree
library(rattle)
fancyRpartPlot(tree$finalModel)
#Accuracy Summary
results <- resamples(list(lda=lda,tree=tree,qda=qda))
summary(results)
Call:
summary.resamples(object = results)
Models: lda, tree, qda
Number of resamples: 30
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lda 0.5063694 0.5292719 0.5418330 0.5396596 0.5499004 0.5645933 0
tree 0.5509554 0.5686429 0.5725678 0.5724237 0.5792197 0.5901116 0
qda 0.4976077 0.5295056 0.5450203 0.5425249 0.5525478 0.5891720 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu.
lda -0.05017855 -0.0008301595 0.02702024 0.01982715 0.03751205
tree 0.01305186 0.0606883866 0.07006596 0.07212952 0.08593152
qda -0.01672012 0.0514454290 0.08022600 0.07452118 0.09850885
Max. NA's
lda 0.06834053 0
tree 0.11901559 0
qda 0.16464389 0
dotplot(results)
#plot ROC curve
library(pROC)
probsTrain <- predict(qda, gettree, type = "prob")
rocCurve <- roc(response = gettree$Sale,
predictor = probsTrain[, "1"],
levels = rev(levels(gettree$Sale)))
Setting direction: controls > cases
plot(rocCurve, print.thres = "best")
Discriminant analysis and a decision tree were chosen to build a simple rule for predicting sales after a call. Linear and quadratic discriminant analysis model the distribution of the predictors separately in each of the response classes (i.e., Sale = “Yes”, Sale = “No”), and then use Bayes’ theorem to flip these around into estimates of the probability of each response category given the value of X. The result is a straightforward set of rules that can be easily interpreted. Trees stratify the predictor space into a number of simple regions, which likewise yields a straightforward set of rules for classifying calls as sales or not.
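Concretely, with prior probability \(\pi_k\) for class \(k\) and class-conditional density \(f_k(x)\) (Gaussian with a shared covariance matrix for LDA, a class-specific covariance for QDA), the posterior is
\[
\Pr(Y = k \mid X = x) = \frac{\pi_k f_k(x)}{\sum_{l} \pi_l f_l(x)}.
\]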
The table above shows that the tree has the best accuracy rate, at 57 percent; that is, the tree correctly classifies calls as sales or not 57 percent of the time. However, the rule it generates is so small that the distribution of predicted probabilities is limited. As such, the quadratic discriminant model was chosen, since it had the highest Cohen’s \(\kappa\) value, indicating the most agreement between model scores and observed sales.
This simple rule-based strategy for calling users could improve sales efficiency. According to the ROC curve, we could call 40 percent of users and capture 60 percent of the sales; likewise, calling 60 percent of users would capture 80 percent of sales. This gain in efficiency could be beneficial, but there is still a lot of room for improvement: a more complex predictive modeling technique could easily outperform these simple methods.
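To turn the curve into a concrete calling rule, the threshold pROC flags as "best" can be extracted and applied to the predicted probabilities; a sketch using rocCurve and probsTrain from above:
#sketch: share of users whose predicted probability clears the "best" threshold
thr<-unlist(coords(rocCurve, "best", ret="threshold"))[1]
mean(probsTrain[,"1"] >= thr)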
7. A Data Scientist on the team created a model to improve the efficiency of calling. Each user’s score is located in modelScores.csv. Using these scores, compare calling the top 25% of scores versus a strategy where we call 25% of your rule-based strategy from (6). Does the model improve sales? By how much? According to the model, what portion of users should we call?
gettree<-sqldf("select t1.*
,(CASE WHEN t2.SaleDateTime>0 THEN 1 ELSE 0 END) as Sale
from users t1
left join sales t2 on (t1.userid=t2.userid)
left join calls t3 on (t1.PhoneNumber_hashed=t3.PhoneNumber_hashed)
where t3.Contacted=1", stringsAsFactors = FALSE)
gettree$Sale<-as.factor(gettree$Sale)
gettree$accountcreationdatetime<-ymd_hms(gettree$accountcreationdatetime)
gettree$PhoneNumber_hashed<-NULL
gettree$InboundCall<-as.factor(gettree$InboundCall)
gettree$IsDealSeeker<-as.factor(gettree$IsDealSeeker)
gettree$IsDirtLover<-as.factor(gettree$IsDirtLover)
gettree$IsDreamer<-as.factor(gettree$IsDreamer)
gettree$Sale<-as.factor(gettree$Sale)
gettree$IsGreen<-as.factor(gettree$IsGreen)
gettree$IsResearcher<-as.factor(gettree$IsResearcher)
gettree$TradeInValueGenerated<-as.factor(gettree$TradeInValueGenerated)
gettree$CalledUs<-as.factor(ifelse(gettree$InboundCallCount>0,1,0))
gettree$InboundCallCount<-NULL
gettree$accountcreationdatetime<-ymd_hms(gettree$accountcreationdatetime)
#get model scores
mod <- merge(gettree,scores,by="userid")
#plot ROC curve
library(pROC)
rocCurve <- roc(response = mod$Sale,
predictor = mod$modelScore,
levels = rev(levels(mod$Sale)))
Setting direction: controls > cases
plot(rocCurve, print.thres = "best")
#my prediction
mod$mypred<-predict(qda, mod, type = "prob")[,2]
mod$score_grp<-cut2(mod$modelScore,cuts=quantile(mod$modelScore,seq(0,1,0.25)))
mod$myscore_grp<-cut2(mod$mypred,cuts=quantile(mod$mypred,seq(0,1,0.25)))
group=c('Q4','Q3','Q2','Q1')
my_score=prop.table(table(mod$myscore_grp,mod$Sale),2)[,2]
score=prop.table(table(mod$score_grp,mod$Sale),2)[,2]
df<-data.frame(group,score,my_score,stringsAsFactors=FALSE)
names(df)<-c('Group','Original Score','My Score')
#plot comparison
op <- options(gvis.plot.tag='chart')
Column <- gvisColumnChart(df,options=list(min=0,title="Comparing Sales across two models", legend="bottom", width=950, height=450,series="[{color:'#e41a1c'},{color: '#377eb8'}]"))
plot(Column)
Calling 58.7 percent of the users nets 63.3 percent of the sales under this model. Calling the top 25 percent of model scores would capture 26.5 percent of sales, while calling the top 25 percent under the qda rule from question 6 would capture 31.8 percent of sales. The rule-based strategy therefore appears to improve sales efficiency by roughly 5 percentage points: calling the same number of people, about 5 percent more of those calls would result in sales using the rule from question 6. We could still improve on that with a more complex predictor.
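A more granular comparison than the quartile chart is a cumulative gains calculation; a sketch using the mod frame above:
#sketch: cumulative share of sales captured as calling depth increases
gains<-mod %>%
  arrange(desc(modelScore)) %>%
  mutate(pct_called=row_number()/n(),
         pct_sales=cumsum(Sale=="1")/sum(Sale=="1"))
gains[which.min(abs(gains$pct_called-0.25)), c("pct_called","pct_sales")] #sales captured at 25% depth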