### LOAD DATA
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
users_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/users.csv"
searches_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/searches.csv"
vdps_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/vdps.csv"
sales_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/sales.csv"
users <- read.csv(users_path)
searches <- read.csv(searches_path)
vdps <- read.csv(vdps_path)
sales <- read.csv(sales_path)
searches <- searches %>% arrange(user_id)
vdps <- vdps %>% arrange(user_id)
sales <- sales %>% arrange(user_id)
searches$type <- 1
vdps$type <- 2
sales$type <- 3
searches$treatment <- users$treatment[match(searches$user_id,users$user_id)]
searches$region <- users$region[match(searches$user_id,users$user_id)]
vdps$treatment <- users$treatment[match(vdps$user_id,users$user_id)]
vdps$region <- users$region[match(vdps$user_id,users$user_id)]
sales$treatment <- users$treatment[match(sales$user_id,users$user_id)]
sales$region <- users$region[match(sales$user_id,users$user_id)]
## ALL DATA COMBINED
df <- rbind(searches, vdps)
df <- rbind(df, sales)
# df$treatment <- users$treatment[match(df$user_id,users$user_id)]
# df$region <- users$region[match(df$user_id,users$user_id)]
df$type <- as.factor(df$type)
df_agg <- df %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))
### CHECK TOTAL TREATMENT TO CONTROL UNITS
treatment_count <- df %>%
group_by(treatment) %>%
summarise(n_distinct(user_id))
treatment_count <- as.data.frame(treatment_count)
colnames(treatment_count)[colnames(treatment_count)=="n_distinct(user_id)"] <- "Total"
df <- df %>% mutate(sale=ifelse(user_id %in% sales$user_id,1,0))
table(df$treatment, df$sale)
##
## 0 1
## Control 38369 6820
## Test 38452 6649
Chi-Square Test Fails to Reject the Null. Test Treatment Alone Cannot Conclusively Explain Successful Customer Sale.
overall_chi <- chisq.test(df$treatment, df$sale)
overall_chi
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$treatment and df$sale
## X-squared = 2.1474, df = 1, p-value = 0.1428
overall_chi_table <- chisq.test(table(df$treatment, df$sale))
overall_chi_table
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(df$treatment, df$sale)
## X-squared = 2.1474, df = 1, p-value = 0.1428
Total Observations of Treatments Test and Control Nearly Even.
p <- ggplot(data=treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
geom_bar(colour="black", stat="identity") +
guides(fill=FALSE)
ggplotly(p)
summary(df)
## user_id event_date_time device_type
## Min. : 1000 2018-06-28 18:44:36: 11 Desktop:52863
## 1st Qu.: 4280 2018-06-28 18:51:08: 10 Mobile :37427
## Median : 7614 2018-06-28 18:53:12: 10
## Mean : 7504 2018-06-28 18:42:45: 9
## 3rd Qu.:10658 2018-06-28 18:48:07: 9
## Max. :13999 2018-06-28 18:52:59: 9
## (Other) :90232
## event_id type treatment
## 00001b21-59d4-4e70-9141-a16766a6f290: 1 1:71212 Control:45189
## 0000657a-f8d9-4ed8-b413-fbcbd0584bb4: 1 2:17738 Test :45101
## 0000858a-c861-4372-8c56-3a4a6fda8f00: 1 3: 1340
## 0000b07c-28dc-4ff0-a81f-80c07e73b620: 1
## 0000b545-28aa-4ba0-9ea1-bf1f602ecd7c: 1
## 0001289c-829d-4676-8760-d2aa7463e8d3: 1
## (Other) :90284
## region sale
## Midwest :17744 Min. :0.0000
## New England :14317 1st Qu.:0.0000
## Pacific Northwest:27910 Median :0.0000
## Southeast :14551 Mean :0.1492
## Southwest :15768 3rd Qu.:0.0000
## Max. :1.0000
##
df2 <- df
df2$event_date_time <- as.POSIXct(df2$event_date_time)
# scat <- plot_ly(data = df2, x = ~event_date_time, y = ~type, type = 'scatter',
# mode = 'markers', symbol = ~treatment, symbols = c('circle','x','o'),
# color = I('black'), marker = list(size = 10))
# scat
df2_summarise <- df2 %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)
df2_summarise <- merge(df2_summarise, df_agg)
df2_summarise$minutes <- as.numeric(df2_summarise$visit)
df2_summarise$treatment <- users$treatment[match(df2_summarise$user_id,users$user_id)]
df2_summarise$region <- users$region[match(df2_summarise$user_id,users$user_id)]
df2_summarise <- df2_summarise %>% mutate(sale=ifelse(user_id %in% sales$user_id,1,0))
summary(df2_summarise)
## user_id visitmax
## Min. : 1000 Min. :2018-06-28 00:00:56
## 1st Qu.: 4250 1st Qu.:2018-06-28 06:13:15
## Median : 7500 Median :2018-06-28 12:16:47
## Mean : 7500 Mean :2018-06-28 12:16:10
## 3rd Qu.:10749 3rd Qu.:2018-06-28 18:15:22
## Max. :13999 Max. :2018-06-29 02:03:09
## visitmin visit interactions
## Min. :2018-06-28 00:00:00 Length:13000 Min. : 1.00
## 1st Qu.:2018-06-28 05:58:02 Class :difftime 1st Qu.: 4.00
## Median :2018-06-28 12:01:08 Mode :numeric Median : 6.00
## Mean :2018-06-28 11:59:33 Mean : 6.84
## 3rd Qu.:2018-06-28 17:57:40 3rd Qu.: 9.00
## Max. :2018-06-28 23:59:41 Max. :899.00
## minutes treatment region sale
## Min. : 0.000 Control:6565 Midwest :2533 Min. :0.0000
## 1st Qu.: 2.367 Test :6435 New England :2584 1st Qu.:0.0000
## Median : 4.183 Pacific Northwest:3941 Median :0.0000
## Mean : 5.191 Southeast :1960 Mean :0.1031
## 3rd Qu.: 6.150 Southwest :1982 3rd Qu.:0.0000
## Max. :59.000 Max. :1.0000
# df2_summarise <- df2_summarise %>% filter(interactions != 899.00)
# df2_summarise <- df2_summarise %>% filter(minutes < 30.000)
Even Distribution of minutes spent on the website between Control group and Test Group.
ggboxplot(df2_summarise, x = "treatment", y = "minutes",
color = "treatment", palette = c("#00AFBB", "#E7B800"),
ylab = "minutes", xlab = "treatment")
Comparison of minutes spent on the website to nonsale observations (0) and successful sale observations (1).
Contrary to what may be assumed, this data suggests individuals who bought a car on the website did so in under 5 minutes regardless of whether they were from the control group or the test group. This may suggest these customers already knew what they wanted and quickly went to their desired destination without shopping around. It would be important to investigate how many customers were returned customers that had spend significant time on the website during a prior search. Regardless of region, great minutes spent on the website does not translate to sales.
plot_ly(df2_summarise, x = ~sale, y = ~minutes, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(df2_summarise, x = ~sale, y = ~minutes, color = ~region, type = "box") %>%
layout(boxmode = "group")
Outlier interaction observation causes clouded and deceptive visuals.
plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~region, type = "box") %>%
layout(boxmode = "group")
Once the interaction outlier is removed a relationship between interactions with the website and successful sales becomes more evident. Possible assumptions from the data: 1.) Customers determined to purchase a car will quickly quick through the website to get to their desired purchase, leisure time is not had. 2.) Purchasing a vehicle involves extra steps and therefore, at a minimum three steps are taken in order to purchase a vehicle.
Search - VDPS - Sale
df2_summarise <- df2_summarise %>% filter(interactions != 899.00)
plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~region, type = "box") %>%
layout(boxmode = "group")
treatment.ftest <- var.test(interactions ~ treatment, data = df2_summarise)
## FAIL TO REJECT THE NULL. THERE IS A DIFFERENCE IN VARIANCE BETWEEN INTERACTIONS AND TREATMENT
On average the Test treatment led to 0.42 times more interactions by customers than compared with the control group.
interactions <- glm(interactions ~ treatment, data = df2_summarise)
summary(interactions)
##
## Call:
## glm(formula = interactions ~ treatment, data = df2_summarise)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.988 -2.559 -0.559 2.012 15.012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.55896 0.03984 164.648 < 2e-16 ***
## treatmentTest 0.42939 0.05662 7.584 3.58e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 10.41658)
##
## Null deviance: 135983 on 12998 degrees of freedom
## Residual deviance: 135384 on 12997 degrees of freedom
## AIC: 67355
##
## Number of Fisher Scoring iterations: 2
# inter_lm <- lm(interactions ~ factor(treatment), data = df2_summarise)
# summary(interactions)
# summary(inter_lm)
Additionally, on average the Test group spent 0.43 times more minutes on the page. But this is not necessarily a good thing due to the conclusion investigated previously that less time is associated more with sales than more time on the website.
minutes <- glm(minutes ~ treatment, data = df2_summarise)
summary(minutes)
##
## Call:
## glm(formula = minutes ~ treatment, data = df2_summarise)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.350 -2.800 -1.016 0.966 53.966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.03450 0.07753 64.933 < 2e-16 ***
## treatmentTest 0.31508 0.11020 2.859 0.00425 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 39.45956)
##
## Null deviance: 513178 on 12998 degrees of freedom
## Residual deviance: 512856 on 12997 degrees of freedom
## AIC: 84668
##
## Number of Fisher Scoring iterations: 2
OVERALL DATASET CONCLUSIONS: All of the data is observational with many potential confounding variables so causality cannot be made. Though a greater frequency of interactions and a lower over all time spent on the website are associated with greater sales, the treatment group does not display any evidence that it directly impacts sale rates. Indirectly, the test group may impact sales by boosting interactions with the website higher which may lead to slightly greater sales but that conclusion cannot be made with this data alone.
sale_logit <- glm(sale ~ treatment + interactions + minutes, data = df2_summarise, family = "binomial")
summary(sale_logit)
##
## Call:
## glm(formula = sale ~ treatment + interactions + minutes, family = "binomial",
## data = df2_summarise)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6068 -0.0323 -0.0038 -0.0001 3.8365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.8401 0.2273 -21.294 <2e-16 ***
## treatmentTest 0.1853 0.1963 0.944 0.345
## interactions 2.8491 0.1383 20.607 <2e-16 ***
## minutes -5.3710 0.2689 -19.973 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8626.36 on 12998 degrees of freedom
## Residual deviance: 782.76 on 12995 degrees of freedom
## AIC: 790.76
##
## Number of Fisher Scoring iterations: 11
plot(sale_logit)
# chi <- df2 %>% select(type, treatment)
# table(chi)
# # chi2 <- df2 %>% select(sale, treatment, region)
#
#
# chisq <- chisq.test(chi$type, chi$treatment)
# chisq
vdps <- vdps %>% arrange(user_id)
vdps_treatment_count <- vdps %>%
group_by(treatment) %>%
summarise(n_distinct(user_id))
vdps_treatment_count <- as.data.frame(vdps_treatment_count)
colnames(vdps_treatment_count)[colnames(vdps_treatment_count)=="n_distinct(user_id)"] <- "Total"
print(vdps_treatment_count)
## treatment Total
## 1 Control 4469
## 2 Test 4209
Total Counts of Test and Control treatment group customers who clicked the vehicle details page.
p <- ggplot(data=vdps_treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
geom_bar(colour="black", stat="identity") +
guides(fill=FALSE)
ggplotly(p)
vdps$event_date_time <- as.POSIXct(vdps$event_date_time)
vdps_summarise <- vdps %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)
vdps_agg <- vdps %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))
vdps_summarise <- merge(vdps_summarise, vdps_agg)
vdps_summarise$minutes <- as.numeric(vdps_summarise$visit)
vdps_summarise$treatment <- users$treatment[match(vdps_summarise$user_id,users$user_id)]
vdps_summarise$region <- users$region[match(vdps_summarise$user_id,users$user_id)]
## HUGE OUTLIER IN SUMMARY REPORT. INTERACTIONS ABOVE 100 MAY NEED TO BE REMOVED.
summary(vdps_summarise)
## user_id visitmax
## Min. : 1000 Min. :2018-06-28 00:02:05
## 1st Qu.: 4193 1st Qu.:2018-06-28 06:01:42
## Median : 7450 Median :2018-06-28 12:07:00
## Mean : 7460 Mean :2018-06-28 12:05:15
## 3rd Qu.:10750 3rd Qu.:2018-06-28 18:04:51
## Max. :13999 Max. :2018-06-29 00:03:33
## visitmin visit interactions
## Min. :2018-06-28 00:02:05 Length:8678 Min. : 1.000
## 1st Qu.:2018-06-28 06:00:52 Class :difftime 1st Qu.: 1.000
## Median :2018-06-28 12:05:55 Mode :numeric Median : 2.000
## Mean :2018-06-28 12:04:15 Mean : 2.001
## 3rd Qu.:2018-06-28 18:04:06 3rd Qu.: 2.000
## Max. :2018-06-29 00:01:47 Max. :630.000
## minutes treatment region
## Min. : 0.000 Control:4469 Midwest :1413
## 1st Qu.: 0.000 Test :4209 New England :1633
## Median : 1.067 Pacific Northwest:2810
## Mean : 6.006 Southeast :1447
## 3rd Qu.: 2.250 Southwest :1375
## Max. :59.000
boxplot(vdps_summarise$interactions)
interaction_outliers <- boxplot(vdps_summarise$interactions, plot=FALSE)$out
vdps_summarise_remove_outliers <- vdps_summarise[-which(vdps_summarise$interactions %in% interaction_outliers),]
plot_ly(vdps_summarise, x = ~treatment, y = ~interactions, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise, x = ~treatment, y = ~interactions, color = ~region, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise, x = ~treatment, y = ~minutes, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise, x = ~treatment, y = ~minutes, color = ~region, type = "box") %>%
layout(boxmode = "group")
vdps_summarise_clean <- vdps_summarise %>% filter(interactions != 630.00)
plot_ly(vdps_summarise_clean, x = ~treatment, y = ~interactions, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise_clean, x = ~treatment, y = ~interactions, color = ~region, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise_clean, x = ~treatment, y = ~minutes, color = ~treatment, type = "box") %>%
layout(boxmode = "group")
plot_ly(vdps_summarise_clean, x = ~treatment, y = ~minutes, color = ~region, type = "box") %>%
layout(boxmode = "group")
searches_region_count <- searches %>%
group_by(region, treatment) %>%
summarise(n_distinct(user_id))
vdps_region_count <- vdps %>%
group_by(region, treatment) %>%
summarise(n_distinct(user_id))
The following graphs display a similar flow as the VPDS section. First the outlier is taken out from interactions. Then no signficant differences in data are apparent, though a positive correlation between the Test group and interactions exists which accedes with the displays above.
sales <- sales %>% arrange(user_id)
searches_treatment_count <- searches %>%
group_by(treatment) %>%
summarise(n_distinct(user_id))
colnames(searches_treatment_count)[colnames(searches_treatment_count)=="n_distinct(user_id)"] <- "Total"
sales_treatment_count <- sales %>%
group_by(treatment) %>%
summarise(n_distinct(user_id))
sales_region_count <- sales %>%
group_by(region, treatment) %>%
summarise(n_distinct(user_id))
sales_treatment_count <- as.data.frame(sales_treatment_count)
colnames(sales_treatment_count)[colnames(sales_treatment_count)=="n_distinct(user_id)"] <- "Total"
print(sales_treatment_count)
## treatment Total
## 1 Control 686
## 2 Test 654
sale_graph <- ggplot(data=sales_treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
geom_bar(colour="black", stat="identity") +
guides(fill=FALSE)
ggplotly(sale_graph)
sales$event_date_time <- as.POSIXct(sales$event_date_time)
sales_summarise <- sales %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)
sales_agg <- sales %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))
sales_summarise <- merge(sales_summarise, sales_agg)
sales_summarise$minutes <- as.numeric(sales_summarise$visit)
sales_summarise$treatment <- users$treatment[match(sales_summarise$user_id,users$user_id)]
sales_summarise$region <- users$region[match(sales_summarise$user_id,users$user_id)]
## HUGE OUTLIER IN SUMMARY REPORT. INTERACTIONS ABOVE 100 REMOVE.
summary(sales_summarise)
## user_id visitmax
## Min. : 1036 Min. :2018-06-28 02:02:21
## 1st Qu.: 3998 1st Qu.:2018-06-28 08:23:43
## Median : 7382 Median :2018-06-28 14:27:32
## Mean : 7384 Mean :2018-06-28 14:19:22
## 3rd Qu.:10597 3rd Qu.:2018-06-28 20:33:11
## Max. :13993 Max. :2018-06-29 02:03:09
## visitmin visit interactions minutes
## Min. :2018-06-28 02:02:21 Length:1340 Min. :1 Min. :0
## 1st Qu.:2018-06-28 08:23:43 Class :difftime 1st Qu.:1 1st Qu.:0
## Median :2018-06-28 14:27:32 Mode :numeric Median :1 Median :0
## Mean :2018-06-28 14:19:22 Mean :1 Mean :0
## 3rd Qu.:2018-06-28 20:33:11 3rd Qu.:1 3rd Qu.:0
## Max. :2018-06-29 02:03:09 Max. :1 Max. :0
## treatment region
## Control:686 Midwest :227
## Test :654 New England :222
## Pacific Northwest:465
## Southeast :204
## Southwest :222
##
searches_treatment_count$type <- "searches"
vdps_treatment_count$type <- "vdps"
sales_treatment_count$type <- "sales"
Totals <- rbind(searches_treatment_count, vdps_treatment_count)
Totals <- rbind(Totals, sales_treatment_count)
Totals <- Totals %>% arrange(Total)
Totals$type <- factor(Totals$type, levels=c("searches", "vdps", "sales"))
t <- ggplot(data=Totals, aes(x=treatment, y=Total, fill=treatment)) +
geom_bar(colour="black", stat="identity") +
guides(fill=FALSE) + facet_wrap(~type)
ggplotly(t)
chi <- chisq.test(Totals$treatment, Totals$type)
CONCLUSIONS:
Not enough is known about the treatment and no causality exists from this dataset to suggest continuing application of the treatment would be strategically beneficial. If the treatment results in a cost of time and money it appears to host a negligible impact on actual car sales. If further study is needed this data suggests applying it to increases in customer interaction could be beneficial, but it is not recommended for permanent deployment.
#### REGION VIEW
searches_region_count$type <- "searches"
vdps_region_count$type <- "vdps"
sales_region_count$type <- "sales"
region_totals <- rbind(searches_region_count, vdps_region_count)
region_totals <- rbind(region_totals, sales_region_count)
region_totals <- as.data.frame(region_totals)
region_totals$type <- factor(region_totals$type, levels=c("sales","vdps", "searches"))
colnames(region_totals)[colnames(region_totals)=="n_distinct(user_id)"] <- "Total"
r <- ggplot(data=region_totals, aes(x=treatment, y=Total, fill=type)) +
geom_bar(colour="black", stat="identity") +
guides(fill=FALSE) + facet_wrap(~region)
ggplotly(r)