TDB Takehome

imps <- read_csv("dsc_imps.csv")

## Parsed with column specification:
## cols(
##   tdb_session_id = col_character(),
##   dt = col_date(format = ""),
##   device = col_character(),
##   ct_vars = col_character()
## )

exp <- read_csv("dsc_exp_ids.csv")

## Parsed with column specification:
## cols(
##   tdb_session_id = col_character(),
##   dt = col_date(format = ""),
##   device = col_character(),
##   exp_id = col_character(),
##   exp_var = col_double()
## )

Exposure Curve

exp %>% 
  filter(exp_id == '4RnrQSacRKKVkTWbGlvLgg', 
         exp_var < 2) %>% 
  count(dt, exp_var) %>% 
  group_by(exp_var) %>% 
  dplyr::mutate(cs = cumsum(n)) %>% 
  ggplot(aes(dt, cs, color = as.factor(exp_var))) + 
  geom_line() + 
  geom_point() + 
  labs(x = "", 
       y = "", 
       title = "DAEM Removal Experiment Cumulative Exposure Curve", 
       subtitle = "Sessions Exposed") + 
  scale_color_discrete(labels=c("Control", "Treatment")) + 
  theme_minimal() + 
  scale_y_continuous(labels = scales::comma) + 
  theme(legend.position = "bottom", 
        legend.title = element_blank(), 
        plot.title.position = "plot")

Doesn’t appear to be SS exposure imbalance (given data set used for plotting)

binom.test(c(19519,19559), 39118, 0.5)

## 
##  Exact binomial test
## 
## data:  c(19519, 19559)
## number of successes = 19519, number of trials = 39078, p-value = 0.8436
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.4945182 0.5044583
## sample estimates:
## probability of success 
##              0.4994882

Ad Impressions

joined_df <- exp %>% 
  filter(exp_id == '4RnrQSacRKKVkTWbGlvLgg', 
         exp_var < 2) %>% 
    mutate(device = recode(device, "desktop" = "Desktop")) %>% 
  inner_join(imps, by=c("tdb_session_id", "dt", "device"))

Probability Densities

exp %>% 
  filter(exp_id == '4RnrQSacRKKVkTWbGlvLgg', 
         exp_var < 2) %>% 
    mutate(device = recode(device, "desktop" = "Desktop")) %>% 
  inner_join(imps, by=c("tdb_session_id", "dt", "device")) %>% 
  count(tdb_session_id, exp_var, sort = TRUE) %>% 
  ggplot() + 
  geom_density(aes(n, color = as.factor(exp_var))) + 
  labs(x = "", 
       y = "", 
       title = "Probability Density: Ads Seen per Session") + 
  scale_color_discrete(labels=c("Control", "Treatment")) + 
  theme_minimal() + 
  theme(legend.position = "bottom", 
        legend.title = element_blank(), 
        plot.title.position = "plot")

DAEM Eligible Sessions

DAEM impressions aren’t logged directly.

You should assume that 1 DAEM impression is delivered and monetized per user session with the fourth non-DAEM impression on an article page type. That is, if a session featured four or more article page impressions, it also featured a single DAEM impression. Further assume that a DAEM impression generates identical revenue to a non-DAEM ad impression.

joined_df %>% 
  count(exp_var, tdb_session_id)

## # A tibble: 27,394 x 3
##    exp_var tdb_session_id                           n
##      <dbl> <chr>                                <int>
##  1       0 00014924-bb18-4715-b45e-be80f2c805e0    11
##  2       0 00073585-a02f-4390-9c33-05b6b9140179    24
##  3       0 00101c9f-36fa-4ce4-917b-efe3c32610f1    10
##  4       0 0010a121-e620-4e5f-be70-ba9694763300     5
##  5       0 00152cd3-ebd5-4969-9220-3cfba2121c61    14
##  6       0 001db346-9b5d-41dd-b69a-8ff8e98e0f66    21
##  7       0 001f9e38-e628-41ff-92f1-55b68651fca6     3
##  8       0 00277cfb-b00a-4e9a-a15a-91b5a140fc8d     3
##  9       0 00282ede-d548-4c8c-b919-d9533be11a3b     4
## 10       0 0028b967-292a-4984-ab02-1fb8c7c2c907    17
## # … with 27,384 more rows

joined_df %>% 
  group_by(exp_var) %>% 
  summarize(num_sessions = n_distinct(tdb_session_id))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   exp_var num_sessions
##     <dbl>        <int>
## 1       0        13718
## 2       1        13676

joined_df %>% 
  mutate(
    ct_vars = str_to_lower(ct_vars), 
    imp_type = case_when(
    grepl("article", ct_vars) ~ 'article', 
    grepl("author", ct_vars) ~ 'author', 
    grepl("home", ct_vars) ~ 'home'
  )) %>% 
  count(tdb_session_id, imp_type, exp_var) %>% 
  filter(n >= 4, 
         imp_type == 'article') %>% 
  group_by(exp_var) %>% 
  summarize(nd = n_distinct(tdb_session_id))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   exp_var    nd
##     <dbl> <int>
## 1       0 10305
## 2       1 10417

 #  mutate(daems_seen = round(n / 4))

Time Series

full_ts <- joined_df %>% 
  group_by(dt, tdb_session_id, exp_var) %>% 
  summarize(ad_impression = n(), .groups="drop") %>% 
  group_by(dt, exp_var) %>% 
  summarize(mean = mean(ad_impression), 
           n_sessions = n_distinct(tdb_session_id), 
           sd = sd(ad_impression), 
           me = 1.96 * (sd / sqrt(n_sessions)), 
            .groups="drop")

just_me <- joined_df %>% 
  group_by(dt, tdb_session_id) %>% 
  summarize(ad_impression = n(), .groups = "drop") %>%
  group_by(dt) %>% 
  summarize(mean = mean(ad_impression), 
           n_sessions = n_distinct(tdb_session_id), 
           sd = sd(ad_impression), 
           me = 1.96 * (sd / sqrt(n_sessions)), 
            .groups="drop")


  full_ts %>% 
    select(dt, exp_var, mean) %>% 
  pivot_wider(names_from = exp_var, values_from = mean) %>% 
  mutate(delta_pct = (`1` - `0`)*100. / `0`) %>% 
  inner_join(just_me , by = c("dt")) %>% 
  ggplot() + 
  geom_errorbar(aes(dt, ymin = delta_pct - me, ymax = delta_pct + me)) + 
    geom_point(aes(dt, delta_pct)) + 
  theme_minimal() + 
    labs(x = "", 
         y = "Pct. Delta")

TDB Takehome

Deblina Mukherjee

5/15/2023

Exposure Curve

Ad Impressions

Probability Densities

DAEM Eligible Sessions

Time Series