Carvana

### LOAD DATA
library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)
library(ggpubr)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

users_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/users.csv"
searches_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/searches.csv"
vdps_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/vdps.csv"
sales_path <- "https://s3-us-west-2.amazonaws.com/carvana-analytics-assignment/sales.csv"


users <- read.csv(users_path)
searches <- read.csv(searches_path)
vdps <- read.csv(vdps_path)
sales <- read.csv(sales_path)

searches <- searches %>% arrange(user_id)
vdps <- vdps %>% arrange(user_id)
sales <- sales %>% arrange(user_id)


searches$type <- 1
vdps$type <- 2
sales$type <- 3


searches$treatment <- users$treatment[match(searches$user_id,users$user_id)]
searches$region <- users$region[match(searches$user_id,users$user_id)]
vdps$treatment <- users$treatment[match(vdps$user_id,users$user_id)]
vdps$region <- users$region[match(vdps$user_id,users$user_id)]
sales$treatment <- users$treatment[match(sales$user_id,users$user_id)]
sales$region <- users$region[match(sales$user_id,users$user_id)]

## ALL DATA COMBINED

df <- rbind(searches, vdps)

df <- rbind(df, sales)


# df$treatment <- users$treatment[match(df$user_id,users$user_id)]
# df$region <- users$region[match(df$user_id,users$user_id)]


df$type <- as.factor(df$type)

df_agg <- df %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))

### CHECK TOTAL TREATMENT TO CONTROL UNITS
treatment_count <- df %>%
  group_by(treatment) %>%
  summarise(n_distinct(user_id))

treatment_count <- as.data.frame(treatment_count)
colnames(treatment_count)[colnames(treatment_count)=="n_distinct(user_id)"] <- "Total"
df <- df %>% mutate(sale=ifelse(user_id %in% sales$user_id,1,0))
table(df$treatment, df$sale)

##          
##               0     1
##   Control 38369  6820
##   Test    38452  6649

Chi-Square Test Fails to Reject the Null. Test Treatment Alone Cannot Conclusively Explain Successful Customer Sale.

overall_chi <- chisq.test(df$treatment, df$sale)
overall_chi

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df$treatment and df$sale
## X-squared = 2.1474, df = 1, p-value = 0.1428

overall_chi_table <- chisq.test(table(df$treatment, df$sale))
overall_chi_table

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(df$treatment, df$sale)
## X-squared = 2.1474, df = 1, p-value = 0.1428

Total Observations of Treatments Test and Control Nearly Even.

p <- ggplot(data=treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE)

ggplotly(p)

summary(df)

##     user_id                 event_date_time   device_type   
##  Min.   : 1000   2018-06-28 18:44:36:   11   Desktop:52863  
##  1st Qu.: 4280   2018-06-28 18:51:08:   10   Mobile :37427  
##  Median : 7614   2018-06-28 18:53:12:   10                  
##  Mean   : 7504   2018-06-28 18:42:45:    9                  
##  3rd Qu.:10658   2018-06-28 18:48:07:    9                  
##  Max.   :13999   2018-06-28 18:52:59:    9                  
##                  (Other)            :90232                  
##                                  event_id     type        treatment    
##  00001b21-59d4-4e70-9141-a16766a6f290:    1   1:71212   Control:45189  
##  0000657a-f8d9-4ed8-b413-fbcbd0584bb4:    1   2:17738   Test   :45101  
##  0000858a-c861-4372-8c56-3a4a6fda8f00:    1   3: 1340                  
##  0000b07c-28dc-4ff0-a81f-80c07e73b620:    1                            
##  0000b545-28aa-4ba0-9ea1-bf1f602ecd7c:    1                            
##  0001289c-829d-4676-8760-d2aa7463e8d3:    1                            
##  (Other)                             :90284                            
##                region           sale       
##  Midwest          :17744   Min.   :0.0000  
##  New England      :14317   1st Qu.:0.0000  
##  Pacific Northwest:27910   Median :0.0000  
##  Southeast        :14551   Mean   :0.1492  
##  Southwest        :15768   3rd Qu.:0.0000  
##                            Max.   :1.0000  
##

df2 <- df
df2$event_date_time <- as.POSIXct(df2$event_date_time)


# scat <- plot_ly(data = df2, x = ~event_date_time, y = ~type, type = 'scatter',
#   mode = 'markers', symbol = ~treatment, symbols = c('circle','x','o'),
#   color = I('black'), marker = list(size = 10))
# scat

df2_summarise <- df2 %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)

df2_summarise <- merge(df2_summarise, df_agg)
df2_summarise$minutes <- as.numeric(df2_summarise$visit)
df2_summarise$treatment <- users$treatment[match(df2_summarise$user_id,users$user_id)]
df2_summarise$region <- users$region[match(df2_summarise$user_id,users$user_id)]
df2_summarise <- df2_summarise %>% mutate(sale=ifelse(user_id %in% sales$user_id,1,0))

summary(df2_summarise)

##     user_id         visitmax                  
##  Min.   : 1000   Min.   :2018-06-28 00:00:56  
##  1st Qu.: 4250   1st Qu.:2018-06-28 06:13:15  
##  Median : 7500   Median :2018-06-28 12:16:47  
##  Mean   : 7500   Mean   :2018-06-28 12:16:10  
##  3rd Qu.:10749   3rd Qu.:2018-06-28 18:15:22  
##  Max.   :13999   Max.   :2018-06-29 02:03:09  
##     visitmin                      visit           interactions   
##  Min.   :2018-06-28 00:00:00   Length:13000      Min.   :  1.00  
##  1st Qu.:2018-06-28 05:58:02   Class :difftime   1st Qu.:  4.00  
##  Median :2018-06-28 12:01:08   Mode  :numeric    Median :  6.00  
##  Mean   :2018-06-28 11:59:33                     Mean   :  6.84  
##  3rd Qu.:2018-06-28 17:57:40                     3rd Qu.:  9.00  
##  Max.   :2018-06-28 23:59:41                     Max.   :899.00  
##     minutes         treatment                  region          sale       
##  Min.   : 0.000   Control:6565   Midwest          :2533   Min.   :0.0000  
##  1st Qu.: 2.367   Test   :6435   New England      :2584   1st Qu.:0.0000  
##  Median : 4.183                  Pacific Northwest:3941   Median :0.0000  
##  Mean   : 5.191                  Southeast        :1960   Mean   :0.1031  
##  3rd Qu.: 6.150                  Southwest        :1982   3rd Qu.:0.0000  
##  Max.   :59.000                                           Max.   :1.0000

# df2_summarise <- df2_summarise %>% filter(interactions != 899.00)
# df2_summarise <- df2_summarise %>% filter(minutes < 30.000)

Even Distribution of minutes spent on the website between Control group and Test Group.

ggboxplot(df2_summarise, x = "treatment", y = "minutes", 
          color = "treatment", palette = c("#00AFBB", "#E7B800"),
        ylab = "minutes", xlab = "treatment")

Comparison of minutes spent on the website to nonsale observations (0) and successful sale observations (1).

Contrary to what may be assumed, this data suggests individuals who bought a car on the website did so in under 5 minutes regardless of whether they were from the control group or the test group. This may suggest these customers already knew what they wanted and quickly went to their desired destination without shopping around. It would be important to investigate how many customers were returned customers that had spend significant time on the website during a prior search. Regardless of region, great minutes spent on the website does not translate to sales.

plot_ly(df2_summarise, x = ~sale, y = ~minutes, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(df2_summarise, x = ~sale, y = ~minutes, color = ~region, type = "box") %>%
  layout(boxmode = "group")

Outlier interaction observation causes clouded and deceptive visuals.

plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~region, type = "box") %>%
  layout(boxmode = "group")

Once the interaction outlier is removed a relationship between interactions with the website and successful sales becomes more evident. Possible assumptions from the data: 1.) Customers determined to purchase a car will quickly quick through the website to get to their desired purchase, leisure time is not had. 2.) Purchasing a vehicle involves extra steps and therefore, at a minimum three steps are taken in order to purchase a vehicle.

Search - VDPS - Sale

df2_summarise <- df2_summarise %>% filter(interactions != 899.00)

plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(df2_summarise, x = ~sale, y = ~interactions, color = ~region, type = "box") %>%
  layout(boxmode = "group")

treatment.ftest <- var.test(interactions ~ treatment, data = df2_summarise)
## FAIL TO REJECT THE NULL. THERE IS A DIFFERENCE IN VARIANCE BETWEEN INTERACTIONS AND TREATMENT

On average the Test treatment led to 0.42 times more interactions by customers than compared with the control group.

interactions <- glm(interactions ~ treatment, data = df2_summarise)
summary(interactions)

## 
## Call:
## glm(formula = interactions ~ treatment, data = df2_summarise)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -5.988  -2.559  -0.559   2.012  15.012  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.55896    0.03984 164.648  < 2e-16 ***
## treatmentTest  0.42939    0.05662   7.584 3.58e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 10.41658)
## 
##     Null deviance: 135983  on 12998  degrees of freedom
## Residual deviance: 135384  on 12997  degrees of freedom
## AIC: 67355
## 
## Number of Fisher Scoring iterations: 2

# inter_lm <- lm(interactions ~ factor(treatment), data = df2_summarise)
# summary(interactions)

# summary(inter_lm)

Additionally, on average the Test group spent 0.43 times more minutes on the page. But this is not necessarily a good thing due to the conclusion investigated previously that less time is associated more with sales than more time on the website.

minutes <- glm(minutes ~ treatment, data = df2_summarise)
summary(minutes)

## 
## Call:
## glm(formula = minutes ~ treatment, data = df2_summarise)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -5.350  -2.800  -1.016   0.966  53.966  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.03450    0.07753  64.933  < 2e-16 ***
## treatmentTest  0.31508    0.11020   2.859  0.00425 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 39.45956)
## 
##     Null deviance: 513178  on 12998  degrees of freedom
## Residual deviance: 512856  on 12997  degrees of freedom
## AIC: 84668
## 
## Number of Fisher Scoring iterations: 2

OVERALL DATASET CONCLUSIONS: All of the data is observational with many potential confounding variables so causality cannot be made. Though a greater frequency of interactions and a lower over all time spent on the website are associated with greater sales, the treatment group does not display any evidence that it directly impacts sale rates. Indirectly, the test group may impact sales by boosting interactions with the website higher which may lead to slightly greater sales but that conclusion cannot be made with this data alone.

sale_logit <- glm(sale ~ treatment + interactions + minutes, data = df2_summarise, family = "binomial")
summary(sale_logit)

## 
## Call:
## glm(formula = sale ~ treatment + interactions + minutes, family = "binomial", 
##     data = df2_summarise)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6068  -0.0323  -0.0038  -0.0001   3.8365  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.8401     0.2273 -21.294   <2e-16 ***
## treatmentTest   0.1853     0.1963   0.944    0.345    
## interactions    2.8491     0.1383  20.607   <2e-16 ***
## minutes        -5.3710     0.2689 -19.973   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8626.36  on 12998  degrees of freedom
## Residual deviance:  782.76  on 12995  degrees of freedom
## AIC: 790.76
## 
## Number of Fisher Scoring iterations: 11

plot(sale_logit)

# chi <- df2 %>% select(type, treatment)
# table(chi)
# # chi2 <- df2 %>% select(sale, treatment, region)
# 
# 
# chisq <- chisq.test(chi$type, chi$treatment)
# chisq

VDPS

vdps <- vdps %>% arrange(user_id)

vdps_treatment_count <- vdps %>%
  group_by(treatment) %>%
  summarise(n_distinct(user_id))

vdps_treatment_count <- as.data.frame(vdps_treatment_count)
colnames(vdps_treatment_count)[colnames(vdps_treatment_count)=="n_distinct(user_id)"] <- "Total"

print(vdps_treatment_count)

##   treatment Total
## 1   Control  4469
## 2      Test  4209

Total Counts of Test and Control treatment group customers who clicked the vehicle details page.

p <- ggplot(data=vdps_treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE)

ggplotly(p)

vdps$event_date_time <- as.POSIXct(vdps$event_date_time)


vdps_summarise <- vdps %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)

vdps_agg <- vdps %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))

vdps_summarise <- merge(vdps_summarise, vdps_agg)

vdps_summarise$minutes <- as.numeric(vdps_summarise$visit)

vdps_summarise$treatment <- users$treatment[match(vdps_summarise$user_id,users$user_id)]
vdps_summarise$region <- users$region[match(vdps_summarise$user_id,users$user_id)]

## HUGE OUTLIER IN SUMMARY REPORT. INTERACTIONS ABOVE 100 MAY NEED TO BE REMOVED. 
summary(vdps_summarise)

##     user_id         visitmax                  
##  Min.   : 1000   Min.   :2018-06-28 00:02:05  
##  1st Qu.: 4193   1st Qu.:2018-06-28 06:01:42  
##  Median : 7450   Median :2018-06-28 12:07:00  
##  Mean   : 7460   Mean   :2018-06-28 12:05:15  
##  3rd Qu.:10750   3rd Qu.:2018-06-28 18:04:51  
##  Max.   :13999   Max.   :2018-06-29 00:03:33  
##     visitmin                      visit           interactions    
##  Min.   :2018-06-28 00:02:05   Length:8678       Min.   :  1.000  
##  1st Qu.:2018-06-28 06:00:52   Class :difftime   1st Qu.:  1.000  
##  Median :2018-06-28 12:05:55   Mode  :numeric    Median :  2.000  
##  Mean   :2018-06-28 12:04:15                     Mean   :  2.001  
##  3rd Qu.:2018-06-28 18:04:06                     3rd Qu.:  2.000  
##  Max.   :2018-06-29 00:01:47                     Max.   :630.000  
##     minutes         treatment                  region    
##  Min.   : 0.000   Control:4469   Midwest          :1413  
##  1st Qu.: 0.000   Test   :4209   New England      :1633  
##  Median : 1.067                  Pacific Northwest:2810  
##  Mean   : 6.006                  Southeast        :1447  
##  3rd Qu.: 2.250                  Southwest        :1375  
##  Max.   :59.000

boxplot(vdps_summarise$interactions)

interaction_outliers <- boxplot(vdps_summarise$interactions, plot=FALSE)$out
vdps_summarise_remove_outliers <- vdps_summarise[-which(vdps_summarise$interactions %in% interaction_outliers),]

plot_ly(vdps_summarise, x = ~treatment, y = ~interactions, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise, x = ~treatment, y = ~interactions, color = ~region, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise, x = ~treatment, y = ~minutes, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise, x = ~treatment, y = ~minutes, color = ~region, type = "box") %>%
  layout(boxmode = "group")

vdps_summarise_clean <- vdps_summarise %>% filter(interactions != 630.00)

plot_ly(vdps_summarise_clean, x = ~treatment, y = ~interactions, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise_clean, x = ~treatment, y = ~interactions, color = ~region, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise_clean, x = ~treatment, y = ~minutes, color = ~treatment, type = "box") %>%
  layout(boxmode = "group")

plot_ly(vdps_summarise_clean, x = ~treatment, y = ~minutes, color = ~region, type = "box") %>%
  layout(boxmode = "group")

searches_region_count <- searches %>%
  group_by(region, treatment) %>%
  summarise(n_distinct(user_id))


vdps_region_count <- vdps %>%
  group_by(region, treatment) %>%
  summarise(n_distinct(user_id))

SALES

The following graphs display a similar flow as the VPDS section. First the outlier is taken out from interactions. Then no signficant differences in data are apparent, though a positive correlation between the Test group and interactions exists which accedes with the displays above.

sales <- sales %>% arrange(user_id)


searches_treatment_count <- searches %>%
  group_by(treatment) %>%
  summarise(n_distinct(user_id))
colnames(searches_treatment_count)[colnames(searches_treatment_count)=="n_distinct(user_id)"] <- "Total"


sales_treatment_count <- sales %>%
  group_by(treatment) %>%
  summarise(n_distinct(user_id))

sales_region_count <- sales %>%
  group_by(region, treatment) %>%
  summarise(n_distinct(user_id))

sales_treatment_count <- as.data.frame(sales_treatment_count)
colnames(sales_treatment_count)[colnames(sales_treatment_count)=="n_distinct(user_id)"] <- "Total"

print(sales_treatment_count)

##   treatment Total
## 1   Control   686
## 2      Test   654

sale_graph <- ggplot(data=sales_treatment_count, aes(x=treatment, y=Total, fill=treatment)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE)

ggplotly(sale_graph)

sales$event_date_time <- as.POSIXct(sales$event_date_time)


sales_summarise <- sales %>% group_by(user_id) %>% summarise(visitmax = max(event_date_time), visitmin = min(event_date_time), visit = visitmax - visitmin)

sales_agg <- sales %>% group_by(user_id) %>% summarize(interactions = n_distinct(event_date_time))

sales_summarise <- merge(sales_summarise, sales_agg)

sales_summarise$minutes <- as.numeric(sales_summarise$visit)

sales_summarise$treatment <- users$treatment[match(sales_summarise$user_id,users$user_id)]
sales_summarise$region <- users$region[match(sales_summarise$user_id,users$user_id)]

## HUGE OUTLIER IN SUMMARY REPORT. INTERACTIONS ABOVE 100 REMOVE. 
summary(sales_summarise)

##     user_id         visitmax                  
##  Min.   : 1036   Min.   :2018-06-28 02:02:21  
##  1st Qu.: 3998   1st Qu.:2018-06-28 08:23:43  
##  Median : 7382   Median :2018-06-28 14:27:32  
##  Mean   : 7384   Mean   :2018-06-28 14:19:22  
##  3rd Qu.:10597   3rd Qu.:2018-06-28 20:33:11  
##  Max.   :13993   Max.   :2018-06-29 02:03:09  
##     visitmin                      visit           interactions    minutes 
##  Min.   :2018-06-28 02:02:21   Length:1340       Min.   :1     Min.   :0  
##  1st Qu.:2018-06-28 08:23:43   Class :difftime   1st Qu.:1     1st Qu.:0  
##  Median :2018-06-28 14:27:32   Mode  :numeric    Median :1     Median :0  
##  Mean   :2018-06-28 14:19:22                     Mean   :1     Mean   :0  
##  3rd Qu.:2018-06-28 20:33:11                     3rd Qu.:1     3rd Qu.:0  
##  Max.   :2018-06-29 02:03:09                     Max.   :1     Max.   :0  
##    treatment                 region   
##  Control:686   Midwest          :227  
##  Test   :654   New England      :222  
##                Pacific Northwest:465  
##                Southeast        :204  
##                Southwest        :222  
##

searches_treatment_count$type <- "searches"
vdps_treatment_count$type <- "vdps"
sales_treatment_count$type <- "sales"
Totals <- rbind(searches_treatment_count, vdps_treatment_count)
Totals <- rbind(Totals, sales_treatment_count)
Totals <- Totals %>% arrange(Total)

Totals$type <- factor(Totals$type, levels=c("searches", "vdps", "sales"))
t <- ggplot(data=Totals, aes(x=treatment, y=Total, fill=treatment)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE) + facet_wrap(~type)

ggplotly(t)

chi <- chisq.test(Totals$treatment, Totals$type)

CONCLUSIONS:

Not enough is known about the treatment and no causality exists from this dataset to suggest continuing application of the treatment would be strategically beneficial. If the treatment results in a cost of time and money it appears to host a negligible impact on actual car sales. If further study is needed this data suggests applying it to increases in customer interaction could be beneficial, but it is not recommended for permanent deployment.

#### REGION VIEW
searches_region_count$type <- "searches"
vdps_region_count$type <- "vdps"
sales_region_count$type <- "sales"

region_totals <- rbind(searches_region_count, vdps_region_count)
region_totals <- rbind(region_totals, sales_region_count)
region_totals <- as.data.frame(region_totals)
region_totals$type <- factor(region_totals$type, levels=c("sales","vdps", "searches"))
colnames(region_totals)[colnames(region_totals)=="n_distinct(user_id)"] <- "Total"

r <- ggplot(data=region_totals, aes(x=treatment, y=Total, fill=type)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE) + facet_wrap(~region)

ggplotly(r)

Carvana

Charles Henderson

VDPS

SALES