library(infer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(timetk)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the raw geo-experiment bookings data (column types are guessed by readr).
hotel_bookings_raw_tbl <- read_csv(
  file = "../data/hotel_bookings_geo_experiment.csv"
)
## Rows: 9225 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): assignment
## dbl (6): geo, geo_group, period, treatment, bookings, cost
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the raw data.
hotel_bookings_raw_tbl %>% head()
## # A tibble: 6 × 8
## date geo geo_group period assignment treatment bookings cost
## <date> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2015-01-05 1 2 0 treatment 0 7327. 0
## 2 2015-01-05 10 1 0 control 0 1927. 0
## 3 2015-01-05 100 1 0 control 0 124. 0
## 4 2015-01-05 11 1 0 control 0 1601. 0
## 5 2015-01-05 12 2 0 treatment 0 1472. 0
## 6 2015-01-05 13 1 0 control 0 1467. 0
# Start and end dates of the two analysis windows, as length-2 Date vectors:
# element 1 is the first day of the window, element 2 the last.
PRE_INTERVENTION <- as_date(c("2015-01-05", "2015-02-15"))
POST_INTERVENTION <- as_date(c("2015-02-16", "2015-03-15"))
# Daily totals of bookings and cost for each assignment group.
# The date column is left for timetk to detect, so it reports which one it used.
bookings_by_assignment_over_time_tbl <- ungroup(
  summarize_by_time(
    group_by(hotel_bookings_raw_tbl, assignment),
    bookings = sum(bookings),
    cost = sum(cost),
    .by = "day"
  )
)
## .date_var is missing. Using: date
# Plot daily bookings for each assignment group and shade the experiment
# window in blue so the post-intervention period stands out.
bookings_by_assignment_over_time_tbl %>%
  group_by(assignment) %>%
  plot_time_series(
    date, bookings,
    .color_var = assignment,
    .interactive = FALSE,
    .title = "Adspend Effect"
  ) +
  annotate(
    "rect",
    # Reuse the experiment-window constants instead of re-typing the dates, so
    # the shaded region always matches the window used in the analysis.
    xmin = POST_INTERVENTION[1],
    xmax = POST_INTERVENTION[2],
    # Infinite y-limits make the rectangle span the full height of the panel.
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "blue"
  )
# - `bookings` is a continuous outcome measured in two groups (treatment and
#   control), so a two-sided two-sample t-test is used to estimate the
#   difference in mean bookings between them.
# * Split data into pre and experiment ----
pre_intervention_only_tbl <- filter_by_time(
  hotel_bookings_raw_tbl,
  .start_date = PRE_INTERVENTION[1],
  .end_date = PRE_INTERVENTION[2]
)
## .date_var is missing. Using: date
experiment_only_tbl <- filter_by_time(
  hotel_bookings_raw_tbl,
  .start_date = POST_INTERVENTION[1],
  .end_date = POST_INTERVENTION[2]
)
## .date_var is missing. Using: date
# * 2-sample t-test ----
# Only the group label and the outcome are needed for the comparison.
diff_in_means_data_tbl <- select(experiment_only_tbl, assignment, bookings)
# `order` makes the reported estimate treatment minus control.
test_statistic_tbl <- t_test(
  diff_in_means_data_tbl,
  formula = bookings ~ assignment,
  order = c("treatment", "control"),
  alternative = "two-sided"
)
test_statistic_tbl
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1.92 2703. 0.0545 two.sided 96.2 -1.87 194.
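There is an average increase of $96.22 in bookings for the treatment group compared to the control group. Note that with p = 0.0545 and a 95% confidence interval of (-1.87, 194) that includes zero, this difference is not statistically significant at the 5% level.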
# * Linear Regression -----
# - Regressing the outcome on the group indicator reproduces the two-sample
#   comparison: the `assignmenttreatment` coefficient is the difference in means
# - The regression form also extends to more complex problems with several
#   regressors
summary(lm(bookings ~ assignment, data = diff_in_means_data_tbl))
##
## Call:
## lm(formula = bookings ~ assignment, data = diff_in_means_data_tbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -707.0 -578.6 -463.6 -120.2 10261.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 717.55 35.39 20.275 <2e-16 ***
## assignmenttreatment 96.22 50.03 1.923 0.0546 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1320 on 2782 degrees of freedom
## Multiple R-squared: 0.001328, Adjusted R-squared: 0.0009688
## F-statistic: 3.699 on 1 and 2782 DF, p-value: 0.05456
# * Average Treatment Effect (ATE) ----
# Difference in mean bookings (treatment minus control) taken from the t-test.
ate <- test_statistic_tbl$estimate
ate
## [1] 96.22111
# Number of treatment observations in the experiment window. Counting by label
# (rather than picking the 2nd row of a count table) does not depend on how the
# assignment groups happen to be ordered.
n_treatment <- sum(diff_in_means_data_tbl$assignment == "treatment")
n_treatment
## [1] 1393
# N * ATE
# Use the computed values instead of re-typed, rounded literals so the result
# stays correct if the data or the experiment window changes.
bookings_increase <- n_treatment * ate
bookings_increase
## [1] 134036
# N * ATE / COST
# NOTE(review): 50000 is the hard-coded total ad spend; presumably it should
# match the summed `cost` column over the experiment window -- confirm.
AD_SPEND_COST <- 50000
ROAS <- bookings_increase / AD_SPEND_COST
ROAS
## [1] 2.68072
This ratio is the overall return on ad spend (ROAS): the incremental bookings attributed to the treatment divided by the cost of the campaign.