#Load in the dataset

library(readxl)

OGFile <- read_excel("C:/Users/westi/Downloads/GW email test data Spring 2025 - Project #2 (4).xlsx", sheet = "Data")

str(OGFile)
## tibble [61,219 × 13] (S3: tbl_df/tbl/data.frame)
##  $ RowNum          : num [1:61219] 1 2 3 4 5 6 7 8 9 10 ...
##  $ PropertyCode    : chr [1:61219] "A" "Q" "C" "C" ...
##  $ EMAIL_DATE      : POSIXct[1:61219], format: "2023-12-06" "2023-12-06" ...
##  $ Delivered       : num [1:61219] 1 1 19 29 58 43 8 18 67 40 ...
##  $ UniqueClick     : num [1:61219] 0 0 0 0 0 5 0 0 1 1 ...
##  $ UniqueUnsub     : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MadeBooking     : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Email_AB        : chr [1:61219] "MFF" "OBC" "CTL" "CTL" ...
##  $ LCC_Audience    : chr [1:61219] "PGS" "PGS" "PGM" "PGM" ...
##  $ LCC_Recency     : chr [1:61219] "L" "R" "A" "A" ...
##  $ LCC_PST         : chr [1:61219] "T" "S" "P" "P" ...
##  $ Segment         : chr [1:61219] "M" "N" "L" "L" ...
##  $ LCC_Engage_Level: chr [1:61219] "1" "4" "1" "2" ...

#Transform the Dataset

# 1. Calculate CTR and Booking Rate
OGFile$CTR <- OGFile$UniqueClick / OGFile$Delivered
OGFile$Booking_Rate <- OGFile$MadeBooking / OGFile$Delivered

# 2. Create Targeted variable (Yes/No)
OGFile$Targeted <- ifelse(
  (OGFile$Segment == "M" & OGFile$Email_AB == "MFF") |
  (OGFile$Segment == "L" & OGFile$Email_AB == "OBC") |
  (OGFile$Segment == "V" & OGFile$Email_AB == "VSA"),
  "Yes", "No"
)

str(OGFile)
## tibble [61,219 × 16] (S3: tbl_df/tbl/data.frame)
##  $ RowNum          : num [1:61219] 1 2 3 4 5 6 7 8 9 10 ...
##  $ PropertyCode    : chr [1:61219] "A" "Q" "C" "C" ...
##  $ EMAIL_DATE      : POSIXct[1:61219], format: "2023-12-06" "2023-12-06" ...
##  $ Delivered       : num [1:61219] 1 1 19 29 58 43 8 18 67 40 ...
##  $ UniqueClick     : num [1:61219] 0 0 0 0 0 5 0 0 1 1 ...
##  $ UniqueUnsub     : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MadeBooking     : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Email_AB        : chr [1:61219] "MFF" "OBC" "CTL" "CTL" ...
##  $ LCC_Audience    : chr [1:61219] "PGS" "PGS" "PGM" "PGM" ...
##  $ LCC_Recency     : chr [1:61219] "L" "R" "A" "A" ...
##  $ LCC_PST         : chr [1:61219] "T" "S" "P" "P" ...
##  $ Segment         : chr [1:61219] "M" "N" "L" "L" ...
##  $ LCC_Engage_Level: chr [1:61219] "1" "4" "1" "2" ...
##  $ CTR             : num [1:61219] 0 0 0 0 0 ...
##  $ Booking_Rate    : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Targeted        : chr [1:61219] "Yes" "No" "No" "No" ...

Part 1 (Filtering the Data)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Create OGFile_Part1 with filters:
OGFile_Part1 <- OGFile %>%
  filter(Segment %in% c("L", "M", "V"),     # Keep only segments L, M, V
         Delivered > 0)                    # Filter out rows with Delivered <= 0 or NA

str(OGFile_Part1)
## tibble [40,191 × 16] (S3: tbl_df/tbl/data.frame)
##  $ RowNum          : num [1:40191] 1 3 4 5 6 7 8 9 10 15 ...
##  $ PropertyCode    : chr [1:40191] "A" "C" "C" "C" ...
##  $ EMAIL_DATE      : POSIXct[1:40191], format: "2023-12-06" "2023-12-06" ...
##  $ Delivered       : num [1:40191] 1 19 29 58 43 8 18 67 40 12 ...
##  $ UniqueClick     : num [1:40191] 0 0 0 0 5 0 0 1 1 0 ...
##  $ UniqueUnsub     : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MadeBooking     : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Email_AB        : chr [1:40191] "MFF" "CTL" "CTL" "CTL" ...
##  $ LCC_Audience    : chr [1:40191] "PGS" "PGM" "PGM" "PGM" ...
##  $ LCC_Recency     : chr [1:40191] "L" "A" "A" "A" ...
##  $ LCC_PST         : chr [1:40191] "T" "P" "P" "P" ...
##  $ Segment         : chr [1:40191] "M" "L" "L" "L" ...
##  $ LCC_Engage_Level: chr [1:40191] "1" "1" "2" "3" ...
##  $ CTR             : num [1:40191] 0 0 0 0 0.116 ...
##  $ Booking_Rate    : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Targeted        : chr [1:40191] "Yes" "No" "No" "No" ...
# Convert Segment and Targeted to factors
OGFile_Part1$Segment <- as.factor(OGFile_Part1$Segment)
OGFile_Part1$Targeted <- as.factor(OGFile_Part1$Targeted)

Part One CTR Model

# Load required package
library(glm2)

# Fit the GLM model for CTR (proportion)
CTR_model <- glm(
  CTR ~ Segment * Targeted,       # Two factors with interaction
  family = binomial,              # Binomial distribution for proportions
  data = OGFile_Part1,            # Filtered dataset (segments L, M, V)
  weights = Delivered             # Weight by Delivered
)

# Run ANOVA with Chi-squared test
anova(CTR_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: CTR
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             40190      76040              
## Segment           2  3141.09     40188      72899 < 2.2e-16 ***
## Targeted          1   913.45     40187      71986 < 2.2e-16 ***
## Segment:Targeted  2   209.91     40185      71776 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Part 1 CTR Vizualization

library(ggplot2)

# Weighted CTR for Part 1
agg_ctr_p1 <- OGFile_Part1 %>%
  group_by(Segment, Targeted) %>%
  summarise(mean_ctr = sum(UniqueClick) / sum(Delivered), .groups = "drop")

# Plot CTR
ggplot(agg_ctr_p1, aes(x = Segment, y = mean_ctr, fill = Targeted)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("CTR (Proportion)") +
  ggtitle("CTR by Segment and Targeted Status (Part 1)") +
  theme_minimal()

#Part One Booking_Rate Model

# Load required package
library(glm2)

# Fit the GLM model for Booking_Rate
Booking_model <- glm(
  Booking_Rate ~ Segment * Targeted,  # Two factors with interaction
  family = binomial,                  # Binomial distribution for proportions
  data = OGFile_Part1,
  weights = Delivered                 # Weight by Delivered
)

# ANOVA with Chi-squared test
anova(Booking_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Booking_Rate
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             40190      16877              
## Segment           2   475.30     40188      16402 < 2.2e-16 ***
## Targeted          1    13.44     40187      16388 0.0002464 ***
## Segment:Targeted  2     2.93     40185      16385 0.2314262    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Part One Booking_Rate Vizualization

# Weighted Booking Rate for Part 1
agg_booking_p1 <- OGFile_Part1 %>%
  group_by(Segment, Targeted) %>%
  summarise(mean_booking = sum(MadeBooking) / sum(Delivered), .groups = "drop")

# Plot Booking Rate
ggplot(agg_booking_p1, aes(x = Segment, y = mean_booking, fill = Targeted)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Booking Rate (Proportion)") +
  ggtitle("Booking Rate by Segment and Targeted Status (Part 1)") +
  theme_minimal()

Post Hoc Analysis Booking_Rate

#Segment L

# Segment L
L_data <- subset(OGFile_Part1, Segment == "L")

successes_L <- with(L_data, tapply(MadeBooking, Targeted, sum))
totals_L <- with(L_data, tapply(Delivered, Targeted, sum))

prop.test(
  x = c(successes_L["Yes"], successes_L["No"]),
  n = c(totals_L["Yes"], totals_L["No"]),
  alternative = "greater"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(successes_L["Yes"], successes_L["No"]) out of c(totals_L["Yes"], totals_L["No"])
## X-squared = 8.7849, df = 1, p-value = 0.001519
## alternative hypothesis: greater
## 95 percent confidence interval:
##  4.740504e-05 1.000000e+00
## sample estimates:
##       prop 1       prop 2 
## 0.0008646468 0.0007523309

#Segment M

# Segment M
M_data <- subset(OGFile_Part1, Segment == "M")

successes_M <- with(M_data, tapply(MadeBooking, Targeted, sum))
totals_M <- with(M_data, tapply(Delivered, Targeted, sum))

prop.test(
  x = c(successes_M["Yes"], successes_M["No"]),
  n = c(totals_M["Yes"], totals_M["No"]),
  alternative = "greater"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(successes_M["Yes"], successes_M["No"]) out of c(totals_M["Yes"], totals_M["No"])
## X-squared = 0.06623, df = 1, p-value = 0.3985
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.000146887  1.000000000
## sample estimates:
##      prop 1      prop 2 
## 0.001519371 0.001488395

#Segment V

# Segment V
V_data <- subset(OGFile_Part1, Segment == "V")

successes_V <- with(V_data, tapply(MadeBooking, Targeted, sum))
totals_V <- with(V_data, tapply(Delivered, Targeted, sum))

prop.test(
  x = c(successes_V["Yes"], successes_V["No"]),
  n = c(totals_V["Yes"], totals_V["No"]),
  alternative = "greater"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(successes_V["Yes"], successes_V["No"]) out of c(totals_V["Yes"], totals_V["No"])
## X-squared = 7.3683, df = 1, p-value = 0.003319
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.0001126434 1.0000000000
## sample estimates:
##      prop 1      prop 2 
## 0.001786971 0.001497704

#Sample Size Table Part 1

# Summarize Delivered emails by Segment and Targeted
assignment_table <- OGFile_Part1 %>%
  group_by(Segment, Targeted) %>%
  summarise(Emails_Delivered = sum(Delivered), .groups = "drop")

# View the table
print(assignment_table)
## # A tibble: 6 × 3
##   Segment Targeted Emails_Delivered
##   <fct>   <fct>               <dbl>
## 1 L       No                2201159
## 2 L       Yes                736717
## 3 M       No                 268074
## 4 M       Yes                267874
## 5 V       No                 295786
## 6 V       Yes                295472

#Area Chart Part 1

# Load necessary packages
library(dplyr)
library(ggplot2)

# 1. Summarize Delivered by date, segment, and targeted
delivery_summary <- OGFile_Part1 %>%
  group_by(EMAIL_DATE, Segment, Targeted) %>%
  summarise(Daily_Delivered = sum(Delivered), .groups = "drop") %>%
  arrange(EMAIL_DATE)

# 2. Create a grouping variable for coloring (Segment + Targeted)
delivery_summary <- delivery_summary %>%
  mutate(Group = paste(Segment, Targeted, sep = "-"))

# 3. Calculate cumulative sum of Delivered within each Group
delivery_summary <- delivery_summary %>%
  group_by(Group) %>%
  arrange(EMAIL_DATE) %>%
  mutate(Cumulative_Delivered = cumsum(Daily_Delivered)) %>%
  ungroup()

# 4. Plot the area chart
ggplot(delivery_summary, aes(x = EMAIL_DATE, y = Cumulative_Delivered, fill = Group)) +
  geom_area() +
  labs(title = "Cumulative Emails Delivered Over Time by Segment and Targeted Status",
       x = "Email Date",
       y = "Cumulative Emails Delivered") +
  theme_minimal()

##Part 2

#Part 2 (Filtering the Dataset)

# Filter for Segments L & N, Delivered > 0
OGFile_Part2 <- OGFile %>%
  filter(Segment %in% c("L", "N"),
         Delivered > 0)

str(OGFile_Part2)
## tibble [41,590 × 16] (S3: tbl_df/tbl/data.frame)
##  $ RowNum          : num [1:41590] 2 3 4 5 6 11 12 13 14 19 ...
##  $ PropertyCode    : chr [1:41590] "Q" "C" "C" "C" ...
##  $ EMAIL_DATE      : POSIXct[1:41590], format: "2023-12-06" "2023-12-06" ...
##  $ Delivered       : num [1:41590] 1 19 29 58 43 1 7 16 12 20 ...
##  $ UniqueClick     : num [1:41590] 0 0 0 0 5 0 0 0 0 0 ...
##  $ UniqueUnsub     : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MadeBooking     : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Email_AB        : chr [1:41590] "OBC" "CTL" "CTL" "CTL" ...
##  $ LCC_Audience    : chr [1:41590] "PGS" "PGM" "PGM" "PGM" ...
##  $ LCC_Recency     : chr [1:41590] "R" "A" "A" "A" ...
##  $ LCC_PST         : chr [1:41590] "S" "P" "P" "P" ...
##  $ Segment         : chr [1:41590] "N" "L" "L" "L" ...
##  $ LCC_Engage_Level: chr [1:41590] "4" "1" "2" "3" ...
##  $ CTR             : num [1:41590] 0 0 0 0 0.116 ...
##  $ Booking_Rate    : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Targeted        : chr [1:41590] "No" "No" "No" "No" ...
# Convert Segment and Email_AB to factors
OGFile_Part2$Segment <- as.factor(OGFile_Part2$Segment)
OGFile_Part2$Email_AB <- as.factor(OGFile_Part2$Email_AB)

#Part 2 CTR Model

# Load required package
library(glm2)

# Fit the GLM model for CTR (Part 2)
CTR_model_Part2 <- glm(
  CTR ~ Segment * Email_AB,      # Interaction between Segment and Email Version
  family = binomial,             # Binomial for proportions
  data = OGFile_Part2,
  weights = Delivered            # Weight by Delivered
)

# ANOVA with Chi-squared test
anova(CTR_model_Part2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: CTR
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             41589      62695              
## Segment           1   461.36     41588      62234 < 2.2e-16 ***
## Email_AB          3   421.53     41585      61812 < 2.2e-16 ***
## Segment:Email_AB  3   186.87     41582      61625 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Part 2 CTR Vizualization

# Weighted CTR for Part 2
agg_ctr_p2 <- OGFile_Part2 %>%
  group_by(Segment, Email_AB) %>%
  summarise(mean_ctr = sum(UniqueClick) / sum(Delivered), .groups = "drop")

# Plot CTR
ggplot(agg_ctr_p2, aes(x = Segment, y = mean_ctr, fill = Email_AB)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("CTR (Proportion)") +
  ggtitle("CTR by Segment and Email Version (Part 2)") +
  theme_minimal()

#Part 2 Booking_Rate Model

# Load required package
library(glm2)

# Fit the GLM model for Booking Rate (Part 2)
Booking_model_Part2 <- glm(
  Booking_Rate ~ Segment * Email_AB,   # Interaction between Segment and Email version
  family = binomial,                   # Binomial for proportions
  data = OGFile_Part2,
  weights = Delivered                  # Weight by Delivered
)

# ANOVA with Chi-squared test
anova(Booking_model_Part2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Booking_Rate
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             41589      13402              
## Segment           1   50.250     41588      13351 1.354e-12 ***
## Email_AB          3    5.846     41585      13346   0.11935    
## Segment:Email_AB  3    8.951     41582      13337   0.02995 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Part 2 Booking_Rate Vizualization

# Weighted Booking Rate for Part 2
agg_booking_p2 <- OGFile_Part2 %>%
  group_by(Segment, Email_AB) %>%
  summarise(mean_booking = sum(MadeBooking) / sum(Delivered), .groups = "drop")

# Plot Booking Rate
ggplot(agg_booking_p2, aes(x = Segment, y = mean_booking, fill = Email_AB)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Booking Rate (Proportion)") +
  ggtitle("Booking Rate by Segment and Email Version (Part 2)") +
  theme_minimal()

#Part 2 Sample Size Table

# Summarize Delivered emails by Segment and Email_AB
assignment_table_Part2 <- OGFile_Part2 %>%
  group_by(Segment, Email_AB) %>%
  summarise(Emails_Delivered = sum(Delivered), .groups = "drop")

# View the table
print(assignment_table_Part2)
## # A tibble: 8 × 3
##   Segment Email_AB Emails_Delivered
##   <fct>   <fct>               <dbl>
## 1 L       CTL                729110
## 2 L       MFF                734156
## 3 L       OBC                736717
## 4 L       VSA                737893
## 5 N       CTL                286787
## 6 N       MFF                288606
## 7 N       OBC                289297
## 8 N       VSA                287562

#Part 2 Area Chart

# Load necessary packages
library(dplyr)
library(ggplot2)
library(tidyr)

# 1. Summarize Delivered by date, segment, and email version
delivery_summary_Part2 <- OGFile_Part2 %>%
  group_by(EMAIL_DATE, Segment, Email_AB) %>%
  summarise(Daily_Delivered = sum(Delivered), .groups = "drop") %>%
  arrange(EMAIL_DATE)

# 2. Create a grouping variable (Segment + Email_AB)
delivery_summary_Part2 <- delivery_summary_Part2 %>%
  mutate(Group = paste(Segment, Email_AB, sep = "-"))

# 3. Calculate cumulative sum within each group
delivery_summary_Part2 <- delivery_summary_Part2 %>%
  group_by(Group) %>%
  arrange(EMAIL_DATE) %>%
  mutate(Cumulative_Delivered = cumsum(Daily_Delivered)) %>%
  ungroup()

# 4. Plot the area chart
ggplot(delivery_summary_Part2, aes(x = EMAIL_DATE, y = Cumulative_Delivered, fill = Group)) +
  geom_area() +
  labs(title = "Cumulative Emails Delivered Over Time by Segment and Email Version",
       x = "Email Date",
       y = "Cumulative Emails Delivered") +
  theme_minimal()