# ============================================================
#  ΕΡΓΑΣΙΑ 010 — A/B Testing & Causal Inference
#  Dataset: Marketing A/B Testing (Kaggle)
#  Στόχος: Αξιολόγηση αποτελεσματικότητας διαφημιστικής καμπάνιας
# ============================================================

# --- Εγκατάσταση & Φόρτωση πακέτων ---


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pwr)
library(broom)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
set.seed(42)

# --- Φόρτωση δεδομένων ---
ads <- read_csv("marketing_AB.csv") |>
  janitor::clean_names() |>
  mutate(
    group     = factor(test_group, levels = c("psa", "ad")),
    converted = as.integer(converted)
  )
## New names:
## Rows: 588101 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): test group, most ads day dbl (4): ...1, user id, total ads, most ads hour
## lgl (1): converted
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
glimpse(ads)
## Rows: 588,101
## Columns: 8
## $ x1            <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ user_id       <dbl> 1069124, 1119715, 1144181, 1435133, 1015700, 1137664, 11…
## $ test_group    <chr> "ad", "ad", "ad", "ad", "ad", "ad", "ad", "ad", "ad", "a…
## $ converted     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ total_ads     <dbl> 130, 93, 21, 355, 276, 734, 264, 17, 21, 142, 209, 47, 6…
## $ most_ads_day  <chr> "Monday", "Tuesday", "Tuesday", "Tuesday", "Friday", "Sa…
## $ most_ads_hour <dbl> 20, 22, 18, 10, 14, 10, 13, 18, 19, 14, 11, 13, 20, 13, …
## $ group         <fct> ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, …
# ============================================================
#  ΜΕΡΟΣ Α — Simulated A/B Test
# ============================================================

# --- Παράμετροι πειράματος ---
n_control   <- 8000      # μέγεθος ομάδας ελέγχου
n_treatment <- 8000      # μέγεθος πειραματικής ομάδας
p_control   <- 0.08      # baseline conversion rate
p_treatment <- 0.10      # μετά την αλλαγή (true effect = +2%)


experiment <- tibble(
  user_id = 1:(n_control + n_treatment),
  group   = c(rep("control", n_control),
              rep("treatment", n_treatment)),
  converted = c(rbinom(n_control,   1, p_control),
              rbinom(n_treatment, 1, p_treatment))
)

summary_stats <- experiment |>
  group_by(group) |>
  summarise(
    n = n(),
    conversions = sum(converted),
    conversion_rate = mean(converted),
    se = sqrt(conversion_rate * (1 - conversion_rate) / n),
    ci_lower = conversion_rate - 1.96 * se,
    ci_upper = conversion_rate + 1.96 * se
  )

summary_stats
## # A tibble: 2 × 7
##   group         n conversions conversion_rate      se ci_lower ci_upper
##   <chr>     <int>       <int>           <dbl>   <dbl>    <dbl>    <dbl>
## 1 control    8000         677          0.0846 0.00311   0.0785   0.0907
## 2 treatment  8000         764          0.0955 0.00329   0.0891   0.102
ggplot(summary_stats, aes(x = group, y = conversion_rate, fill = group)) +
  geom_col() +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  scale_y_continuous(labels = percent) +
  labs(title = "Simulated Conversion Rate", y = "Conversion Rate")

test_result <- prop.test(
  x = summary_stats$conversions,
  n = summary_stats$n,
  correct = FALSE
)

test_result
## 
##  2-sample test for equality of proportions without continuity correction
## 
## data:  summary_stats$conversions out of summary_stats$n
## X-squared = 5.7725, df = 1, p-value = 0.01628
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.019744875 -0.002005125
## sample estimates:
##   prop 1   prop 2 
## 0.084625 0.095500
# pooled estimate
p_pool <- sum(summary_stats$conversions) / sum(summary_stats$n)

# pooled SE
se_pool <- sqrt(p_pool * (1 - p_pool) * (1/n_control + 1/n_treatment))

# difference
delta <- summary_stats$conversion_rate[2] - summary_stats$conversion_rate[1]

# CI
ci_lower <- delta - 1.96 * se_pool
ci_upper <- delta + 1.96 * se_pool

c(delta = delta, ci_lower = ci_lower, ci_upper = ci_upper)
##       delta    ci_lower    ci_upper 
## 0.010875000 0.002003361 0.019746639
effect_size <- ES.h(p1 = 0.10, p2 = 0.08)

pwr_result <- pwr.2p.test(
  h = effect_size,
  sig.level = 0.05,
  power = 0.8
)

pwr_result
## 
##      Difference of proportion power calculation for binomial distribution (arcsine transformation) 
## 
##               h = 0.069988
##               n = 3204.715
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: same sample sizes
# ============================================================
#  ΜΕΡΟΣ Β — Real-World A/B Analysis
# ============================================================


ads |>
  count(group) |>
  mutate(prop = n / sum(n))
## # A tibble: 2 × 3
##   group      n   prop
##   <fct>  <int>  <dbl>
## 1 psa    23524 0.0400
## 2 ad    564577 0.960
ggplot(ads, aes(x = most_ads_day, fill = group)) +
  geom_bar(position = "dodge") +
  labs(title = "Distribution by Day", x = "Day", y = "Count")

real_summary <- ads |>
  group_by(group) |>
  summarise(
    n = n(),
    conversions = sum(converted),
    conversion_rate = mean(converted),
    se = sqrt(conversion_rate * (1 - conversion_rate) / n),
    ci_lower = conversion_rate - 1.96 * se,
    ci_upper = conversion_rate + 1.96 * se
  )

real_summary
## # A tibble: 2 × 7
##   group      n conversions conversion_rate       se ci_lower ci_upper
##   <fct>  <int>       <int>           <dbl>    <dbl>    <dbl>    <dbl>
## 1 psa    23524         420          0.0179 0.000863   0.0162   0.0195
## 2 ad    564577       14423          0.0255 0.000210   0.0251   0.0260
real_test <- prop.test(
  x = real_summary$conversions,
  n = real_summary$n,
  correct = FALSE
)

tidy(real_test)
## # A tibble: 1 × 9
##   estimate1 estimate2 statistic  p.value parameter conf.low conf.high method    
##       <dbl>     <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>     
## 1    0.0179    0.0255      54.3 1.71e-13         1 -0.00943  -0.00595 2-sample …
## # ℹ 1 more variable: alternative <chr>
daily <- ads |>
  group_by(group, most_ads_day) |>
  summarise(
    conversion_rate = mean(converted),
    n = n(),
    se = sqrt(conversion_rate * (1 - conversion_rate) / n),
    ci_lower = conversion_rate - 1.96 * se,
    ci_upper = conversion_rate + 1.96 * se,
    .groups = "drop"
  )

ggplot(daily, aes(x = most_ads_day, y = conversion_rate, color = group)) +
  geom_line(aes(group = group)) +
  geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper, fill = group),
              alpha = 0.15, color = NA) +
  scale_y_continuous(labels = percent) +
  labs(title = "Conversion Rate by Day", y = "Conversion Rate")

# Κατώφλι επιχειρηματικής σημασίας
d_min <- 0.005

# Absolute lift
delta <- real_summary$conversion_rate[2] - real_summary$conversion_rate[1]

# Relative lift
relative_lift <- delta / real_summary$conversion_rate[1]

# CI για τη διαφορά (χειρωνακτικά)
p_pool <- sum(real_summary$conversions) / sum(real_summary$n)

se_pool <- sqrt(p_pool * (1 - p_pool) *
                (1/real_summary$n[1] + 1/real_summary$n[2]))

ci_lower <- delta - 1.96 * se_pool
ci_upper <- delta + 1.96 * se_pool

delta
## [1] 0.007692453
relative_lift
## [1] 0.4308506
ci_lower
## [1] 0.005646721
ci_upper
## [1] 0.009738186

Αποτελέσματα Absolute lift ≈ 0.007 – 0.008 (≈ 0.7% – 0.8%) Relative lift ≈ ~40%+ 95% CI για τη διαφορά: [ci_lower, ci_upper] > 0 Σύγκριση με κατώφλι δ_min = 0.005 Παρατηρούμε ότι: ΟΛΟ το CI είναι > δ_min Ποια περίπτωση ισχύει; Case A (CI entirely above δ_min) Η καμπάνια είναι: Στατιστικά σημαντική Επιχειρηματικά σημαντική Με ουσιαστικό uplift Tελική σύσταση: Να γίνει rollout της διαφημιστικής καμπάνιας

p-value < 0.001 Άρα απορρίπτουμε την H₀ CI ≠ ίδιο, λόγω διαφορετικών μέθόδων Χρειάζονταν περίπου 3.000 / group, έχουμε περίπου 290.000 άρα έχουμε πολύ υψηλό power