AB test

library(infer)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(timetk)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

# Load the raw geo-experiment data; column types are guessed by readr.
hotel_bookings_raw_tbl <- read_csv(
    file = "../data/hotel_bookings_geo_experiment.csv"
)

## Rows: 9225 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): assignment
## dbl  (6): geo, geo_group, period, treatment, bookings, cost
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first rows to check the parsed columns.
hotel_bookings_raw_tbl %>% head()

## # A tibble: 6 × 8
##   date         geo geo_group period assignment treatment bookings  cost
##   <date>     <dbl>     <dbl>  <dbl> <chr>          <dbl>    <dbl> <dbl>
## 1 2015-01-05     1         2      0 treatment          0    7327.     0
## 2 2015-01-05    10         1      0 control            0    1927.     0
## 3 2015-01-05   100         1      0 control            0     124.     0
## 4 2015-01-05    11         1      0 control            0    1601.     0
## 5 2015-01-05    12         2      0 treatment          0    1472.     0
## 6 2015-01-05    13         1      0 control            0    1467.     0

# Inclusive (start, end) date windows for the two phases of the experiment.
PRE_INTERVENTION  <- as_date(c("2015-01-05", "2015-02-15"))
POST_INTERVENTION <- as_date(c("2015-02-16", "2015-03-15"))

# Daily totals of bookings and cost for each assignment group.
# The date column is passed explicitly instead of relying on timetk's
# auto-detection (which emitted a ".date_var is missing" message).
bookings_by_assignment_over_time_tbl <- hotel_bookings_raw_tbl %>%
    group_by(assignment) %>%
    summarize_by_time(
        .date_var = date,
        .by       = "day",
        bookings  = sum(bookings),
        cost      = sum(cost)
    ) %>%
    ungroup()

# Plot daily bookings per assignment group and shade the experiment window.
# The shaded rectangle reuses POST_INTERVENTION rather than retyping the
# dates, so it always matches the window used to split the data.
bookings_by_assignment_over_time_tbl %>%
    group_by(assignment) %>%
    plot_time_series(
        .date_var    = date,
        .value       = bookings,
        .color_var   = assignment,
        .interactive = FALSE,
        .title       = "Adspend Effect"
    ) +
    annotate(
        "rect",
        xmin  = POST_INTERVENTION[1],
        xmax  = POST_INTERVENTION[2],
        ymin  = -Inf,
        ymax  = Inf,
        alpha = 0.2,
        fill  = "blue"
    )

# - We are comparing a continuous outcome (bookings) between 2 groups, so use
#   a two-sided, two-sample t-test on the difference in group means.

# * Split data into pre and experiment ----

# Rows that fall inside the pre-intervention window (inclusive bounds).
# `.date_var` is given explicitly so the split does not depend on timetk
# guessing which column holds the dates.
pre_intervention_only_tbl <- hotel_bookings_raw_tbl %>%
    filter_by_time(
        .date_var   = date,
        .start_date = PRE_INTERVENTION[1],
        .end_date   = PRE_INTERVENTION[2]
    )

# Rows that fall inside the experiment (post-intervention) window.
experiment_only_tbl <- hotel_bookings_raw_tbl %>%
    filter_by_time(
        .date_var   = date,
        .start_date = POST_INTERVENTION[1],
        .end_date   = POST_INTERVENTION[2]
    )

# * 2-sample t-test ----
# Keep only the group label and the outcome for the experiment period.
diff_in_means_data_tbl <- select(experiment_only_tbl, assignment, bookings)

# Two-sided t-test of bookings by assignment; `order` makes the estimate
# read as treatment minus control.
test_statistic_tbl <- t_test(
    x           = diff_in_means_data_tbl,
    formula     = bookings ~ assignment,
    order       = c("treatment", "control"),
    alternative = "two-sided"
)

test_statistic_tbl

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      1.92 2703.  0.0545 two.sided       96.2    -1.87     194.

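There is an average increase of $96.22 in bookings for the treatment group compared to the control group, although with p = 0.0545 and a 95% confidence interval of (-1.87, 194) the effect is not statistically significant at the 5% level.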

# * Linear Regression -----
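# - With a single binary regressor, linear regression is equivalent to a
#   2-sample t-test (lm assumes equal variances, so its result differs very
#   slightly from the Welch t-test above)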
# - More importantly, linear regression can help with more complex problems
#   that contain multiple regressors

# Regress bookings on the assignment indicator and print the model summary;
# the `assignmenttreatment` coefficient is the difference in group means.
summary(lm(bookings ~ assignment, data = diff_in_means_data_tbl))

## 
## Call:
## lm(formula = bookings ~ assignment, data = diff_in_means_data_tbl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -707.0  -578.6  -463.6  -120.2 10261.7 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           717.55      35.39  20.275   <2e-16 ***
## assignmenttreatment    96.22      50.03   1.923   0.0546 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1320 on 2782 degrees of freedom
## Multiple R-squared:  0.001328,   Adjusted R-squared:  0.0009688 
## F-statistic: 3.699 on 1 and 2782 DF,  p-value: 0.05456

# * Average Treatment Effect (ATE) ----

# The ATE is the treatment-minus-control difference in mean bookings
# reported by the t-test.
ate <- test_statistic_tbl$estimate
ate

## [1] 96.22111

# Number of treatment observations in the experiment period. The row is
# selected by name rather than by position so the result does not depend on
# the ordering of count()'s output.
diff_in_means_data_tbl %>%
    count(assignment) %>%
    filter(assignment == "treatment") %>%
    pull(n)

## [1] 1393

# N * ATE
# Incremental bookings = number of treatment observations * ATE. Both factors
# are taken from the data instead of being retyped by hand, so the
# full-precision ATE is used rather than a truncated 96.
n_treatment <- diff_in_means_data_tbl %>%
    filter(assignment == "treatment") %>%
    nrow()

bookings_increase <- n_treatment * ate
bookings_increase

## [1] 134036

# N * ATE / COST
# COST is the figure used in the original analysis; it is not derived from
# the `cost` column here -- TODO confirm it matches the actual ad spend.
AD_SPEND <- 50000
ROAS <- bookings_increase / AD_SPEND
ROAS

## [1] 2.68072

This is the overall return on ad spend (ROAS): roughly $2.7 of incremental bookings for every $1 spent on advertising.

AB test

Manish

2025-03-21