#Load in the dataset
library(readxl)
OGFile <- read_excel("C:/Users/westi/Downloads/GW email test data Spring 2025 - Project #2 (4).xlsx", sheet = "Data")
str(OGFile)
## tibble [61,219 × 13] (S3: tbl_df/tbl/data.frame)
## $ RowNum : num [1:61219] 1 2 3 4 5 6 7 8 9 10 ...
## $ PropertyCode : chr [1:61219] "A" "Q" "C" "C" ...
## $ EMAIL_DATE : POSIXct[1:61219], format: "2023-12-06" "2023-12-06" ...
## $ Delivered : num [1:61219] 1 1 19 29 58 43 8 18 67 40 ...
## $ UniqueClick : num [1:61219] 0 0 0 0 0 5 0 0 1 1 ...
## $ UniqueUnsub : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
## $ MadeBooking : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
## $ Email_AB : chr [1:61219] "MFF" "OBC" "CTL" "CTL" ...
## $ LCC_Audience : chr [1:61219] "PGS" "PGS" "PGM" "PGM" ...
## $ LCC_Recency : chr [1:61219] "L" "R" "A" "A" ...
## $ LCC_PST : chr [1:61219] "T" "S" "P" "P" ...
## $ Segment : chr [1:61219] "M" "N" "L" "L" ...
## $ LCC_Engage_Level: chr [1:61219] "1" "4" "1" "2" ...
#Transform the Dataset
# 1. Calculate CTR and Booking Rate
OGFile$CTR <- OGFile$UniqueClick / OGFile$Delivered
OGFile$Booking_Rate <- OGFile$MadeBooking / OGFile$Delivered
# 2. Create Targeted variable (Yes/No)
OGFile$Targeted <- ifelse(
(OGFile$Segment == "M" & OGFile$Email_AB == "MFF") |
(OGFile$Segment == "L" & OGFile$Email_AB == "OBC") |
(OGFile$Segment == "V" & OGFile$Email_AB == "VSA"),
"Yes", "No"
)
str(OGFile)
## tibble [61,219 × 16] (S3: tbl_df/tbl/data.frame)
## $ RowNum : num [1:61219] 1 2 3 4 5 6 7 8 9 10 ...
## $ PropertyCode : chr [1:61219] "A" "Q" "C" "C" ...
## $ EMAIL_DATE : POSIXct[1:61219], format: "2023-12-06" "2023-12-06" ...
## $ Delivered : num [1:61219] 1 1 19 29 58 43 8 18 67 40 ...
## $ UniqueClick : num [1:61219] 0 0 0 0 0 5 0 0 1 1 ...
## $ UniqueUnsub : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
## $ MadeBooking : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
## $ Email_AB : chr [1:61219] "MFF" "OBC" "CTL" "CTL" ...
## $ LCC_Audience : chr [1:61219] "PGS" "PGS" "PGM" "PGM" ...
## $ LCC_Recency : chr [1:61219] "L" "R" "A" "A" ...
## $ LCC_PST : chr [1:61219] "T" "S" "P" "P" ...
## $ Segment : chr [1:61219] "M" "N" "L" "L" ...
## $ LCC_Engage_Level: chr [1:61219] "1" "4" "1" "2" ...
## $ CTR : num [1:61219] 0 0 0 0 0 ...
## $ Booking_Rate : num [1:61219] 0 0 0 0 0 0 0 0 0 0 ...
## $ Targeted : chr [1:61219] "Yes" "No" "No" "No" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Create OGFile_Part1 with filters:
OGFile_Part1 <- OGFile %>%
filter(Segment %in% c("L", "M", "V"), # Keep only segments L, M, V
Delivered > 0) # Filter out rows with Delivered <= 0 or NA
str(OGFile_Part1)
## tibble [40,191 × 16] (S3: tbl_df/tbl/data.frame)
## $ RowNum : num [1:40191] 1 3 4 5 6 7 8 9 10 15 ...
## $ PropertyCode : chr [1:40191] "A" "C" "C" "C" ...
## $ EMAIL_DATE : POSIXct[1:40191], format: "2023-12-06" "2023-12-06" ...
## $ Delivered : num [1:40191] 1 19 29 58 43 8 18 67 40 12 ...
## $ UniqueClick : num [1:40191] 0 0 0 0 5 0 0 1 1 0 ...
## $ UniqueUnsub : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
## $ MadeBooking : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
## $ Email_AB : chr [1:40191] "MFF" "CTL" "CTL" "CTL" ...
## $ LCC_Audience : chr [1:40191] "PGS" "PGM" "PGM" "PGM" ...
## $ LCC_Recency : chr [1:40191] "L" "A" "A" "A" ...
## $ LCC_PST : chr [1:40191] "T" "P" "P" "P" ...
## $ Segment : chr [1:40191] "M" "L" "L" "L" ...
## $ LCC_Engage_Level: chr [1:40191] "1" "1" "2" "3" ...
## $ CTR : num [1:40191] 0 0 0 0 0.116 ...
## $ Booking_Rate : num [1:40191] 0 0 0 0 0 0 0 0 0 0 ...
## $ Targeted : chr [1:40191] "Yes" "No" "No" "No" ...
# Convert Segment and Targeted to factors
OGFile_Part1$Segment <- as.factor(OGFile_Part1$Segment)
OGFile_Part1$Targeted <- as.factor(OGFile_Part1$Targeted)
# Load required package
library(glm2)
# Fit the GLM model for CTR (proportion)
CTR_model <- glm(
CTR ~ Segment * Targeted, # Two factors with interaction
family = binomial, # Binomial distribution for proportions
data = OGFile_Part1, # Filtered dataset (segments L, M, V)
weights = Delivered # Weight by Delivered
)
# Run ANOVA with Chi-squared test
anova(CTR_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: CTR
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 40190 76040
## Segment 2 3141.09 40188 72899 < 2.2e-16 ***
## Targeted 1 913.45 40187 71986 < 2.2e-16 ***
## Segment:Targeted 2 209.91 40185 71776 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part 1 CTR Vizualization
library(ggplot2)
# Weighted CTR for Part 1
agg_ctr_p1 <- OGFile_Part1 %>%
group_by(Segment, Targeted) %>%
summarise(mean_ctr = sum(UniqueClick) / sum(Delivered), .groups = "drop")
# Plot CTR
ggplot(agg_ctr_p1, aes(x = Segment, y = mean_ctr, fill = Targeted)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("CTR (Proportion)") +
ggtitle("CTR by Segment and Targeted Status (Part 1)") +
theme_minimal()
#Part One Booking_Rate Model
# Load required package
library(glm2)
# Fit the GLM model for Booking_Rate
Booking_model <- glm(
Booking_Rate ~ Segment * Targeted, # Two factors with interaction
family = binomial, # Binomial distribution for proportions
data = OGFile_Part1,
weights = Delivered # Weight by Delivered
)
# ANOVA with Chi-squared test
anova(Booking_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Booking_Rate
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 40190 16877
## Segment 2 475.30 40188 16402 < 2.2e-16 ***
## Targeted 1 13.44 40187 16388 0.0002464 ***
## Segment:Targeted 2 2.93 40185 16385 0.2314262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part One Booking_Rate Vizualization
# Weighted Booking Rate for Part 1
agg_booking_p1 <- OGFile_Part1 %>%
group_by(Segment, Targeted) %>%
summarise(mean_booking = sum(MadeBooking) / sum(Delivered), .groups = "drop")
# Plot Booking Rate
ggplot(agg_booking_p1, aes(x = Segment, y = mean_booking, fill = Targeted)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("Booking Rate (Proportion)") +
ggtitle("Booking Rate by Segment and Targeted Status (Part 1)") +
theme_minimal()
#Segment L
# Segment L
L_data <- subset(OGFile_Part1, Segment == "L")
successes_L <- with(L_data, tapply(MadeBooking, Targeted, sum))
totals_L <- with(L_data, tapply(Delivered, Targeted, sum))
prop.test(
x = c(successes_L["Yes"], successes_L["No"]),
n = c(totals_L["Yes"], totals_L["No"]),
alternative = "greater"
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(successes_L["Yes"], successes_L["No"]) out of c(totals_L["Yes"], totals_L["No"])
## X-squared = 8.7849, df = 1, p-value = 0.001519
## alternative hypothesis: greater
## 95 percent confidence interval:
## 4.740504e-05 1.000000e+00
## sample estimates:
## prop 1 prop 2
## 0.0008646468 0.0007523309
#Segment M
# Segment M
M_data <- subset(OGFile_Part1, Segment == "M")
successes_M <- with(M_data, tapply(MadeBooking, Targeted, sum))
totals_M <- with(M_data, tapply(Delivered, Targeted, sum))
prop.test(
x = c(successes_M["Yes"], successes_M["No"]),
n = c(totals_M["Yes"], totals_M["No"]),
alternative = "greater"
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(successes_M["Yes"], successes_M["No"]) out of c(totals_M["Yes"], totals_M["No"])
## X-squared = 0.06623, df = 1, p-value = 0.3985
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.000146887 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.001519371 0.001488395
#Segment V
# Segment V
V_data <- subset(OGFile_Part1, Segment == "V")
successes_V <- with(V_data, tapply(MadeBooking, Targeted, sum))
totals_V <- with(V_data, tapply(Delivered, Targeted, sum))
prop.test(
x = c(successes_V["Yes"], successes_V["No"]),
n = c(totals_V["Yes"], totals_V["No"]),
alternative = "greater"
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(successes_V["Yes"], successes_V["No"]) out of c(totals_V["Yes"], totals_V["No"])
## X-squared = 7.3683, df = 1, p-value = 0.003319
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.0001126434 1.0000000000
## sample estimates:
## prop 1 prop 2
## 0.001786971 0.001497704
#Sample Size Table Part 1
# Summarize Delivered emails by Segment and Targeted
assignment_table <- OGFile_Part1 %>%
group_by(Segment, Targeted) %>%
summarise(Emails_Delivered = sum(Delivered), .groups = "drop")
# View the table
print(assignment_table)
## # A tibble: 6 × 3
## Segment Targeted Emails_Delivered
## <fct> <fct> <dbl>
## 1 L No 2201159
## 2 L Yes 736717
## 3 M No 268074
## 4 M Yes 267874
## 5 V No 295786
## 6 V Yes 295472
#Area Chart Part 1
# Load necessary packages
library(dplyr)
library(ggplot2)
# 1. Summarize Delivered by date, segment, and targeted
delivery_summary <- OGFile_Part1 %>%
group_by(EMAIL_DATE, Segment, Targeted) %>%
summarise(Daily_Delivered = sum(Delivered), .groups = "drop") %>%
arrange(EMAIL_DATE)
# 2. Create a grouping variable for coloring (Segment + Targeted)
delivery_summary <- delivery_summary %>%
mutate(Group = paste(Segment, Targeted, sep = "-"))
# 3. Calculate cumulative sum of Delivered within each Group
delivery_summary <- delivery_summary %>%
group_by(Group) %>%
arrange(EMAIL_DATE) %>%
mutate(Cumulative_Delivered = cumsum(Daily_Delivered)) %>%
ungroup()
# 4. Plot the area chart
ggplot(delivery_summary, aes(x = EMAIL_DATE, y = Cumulative_Delivered, fill = Group)) +
geom_area() +
labs(title = "Cumulative Emails Delivered Over Time by Segment and Targeted Status",
x = "Email Date",
y = "Cumulative Emails Delivered") +
theme_minimal()
##Part 2
#Part 2 (Filtering the Dataset)
# Filter for Segments L & N, Delivered > 0
OGFile_Part2 <- OGFile %>%
filter(Segment %in% c("L", "N"),
Delivered > 0)
str(OGFile_Part2)
## tibble [41,590 × 16] (S3: tbl_df/tbl/data.frame)
## $ RowNum : num [1:41590] 2 3 4 5 6 11 12 13 14 19 ...
## $ PropertyCode : chr [1:41590] "Q" "C" "C" "C" ...
## $ EMAIL_DATE : POSIXct[1:41590], format: "2023-12-06" "2023-12-06" ...
## $ Delivered : num [1:41590] 1 19 29 58 43 1 7 16 12 20 ...
## $ UniqueClick : num [1:41590] 0 0 0 0 5 0 0 0 0 0 ...
## $ UniqueUnsub : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
## $ MadeBooking : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
## $ Email_AB : chr [1:41590] "OBC" "CTL" "CTL" "CTL" ...
## $ LCC_Audience : chr [1:41590] "PGS" "PGM" "PGM" "PGM" ...
## $ LCC_Recency : chr [1:41590] "R" "A" "A" "A" ...
## $ LCC_PST : chr [1:41590] "S" "P" "P" "P" ...
## $ Segment : chr [1:41590] "N" "L" "L" "L" ...
## $ LCC_Engage_Level: chr [1:41590] "4" "1" "2" "3" ...
## $ CTR : num [1:41590] 0 0 0 0 0.116 ...
## $ Booking_Rate : num [1:41590] 0 0 0 0 0 0 0 0 0 0 ...
## $ Targeted : chr [1:41590] "No" "No" "No" "No" ...
# Convert Segment and Email_AB to factors
OGFile_Part2$Segment <- as.factor(OGFile_Part2$Segment)
OGFile_Part2$Email_AB <- as.factor(OGFile_Part2$Email_AB)
#Part 2 CTR Model
# Load required package
library(glm2)
# Fit the GLM model for CTR (Part 2)
CTR_model_Part2 <- glm(
CTR ~ Segment * Email_AB, # Interaction between Segment and Email Version
family = binomial, # Binomial for proportions
data = OGFile_Part2,
weights = Delivered # Weight by Delivered
)
# ANOVA with Chi-squared test
anova(CTR_model_Part2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: CTR
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 41589 62695
## Segment 1 461.36 41588 62234 < 2.2e-16 ***
## Email_AB 3 421.53 41585 61812 < 2.2e-16 ***
## Segment:Email_AB 3 186.87 41582 61625 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part 2 CTR Vizualization
# Weighted CTR for Part 2
agg_ctr_p2 <- OGFile_Part2 %>%
group_by(Segment, Email_AB) %>%
summarise(mean_ctr = sum(UniqueClick) / sum(Delivered), .groups = "drop")
# Plot CTR
ggplot(agg_ctr_p2, aes(x = Segment, y = mean_ctr, fill = Email_AB)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("CTR (Proportion)") +
ggtitle("CTR by Segment and Email Version (Part 2)") +
theme_minimal()
#Part 2 Booking_Rate Model
# Load required package
library(glm2)
# Fit the GLM model for Booking Rate (Part 2)
Booking_model_Part2 <- glm(
Booking_Rate ~ Segment * Email_AB, # Interaction between Segment and Email version
family = binomial, # Binomial for proportions
data = OGFile_Part2,
weights = Delivered # Weight by Delivered
)
# ANOVA with Chi-squared test
anova(Booking_model_Part2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Booking_Rate
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 41589 13402
## Segment 1 50.250 41588 13351 1.354e-12 ***
## Email_AB 3 5.846 41585 13346 0.11935
## Segment:Email_AB 3 8.951 41582 13337 0.02995 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part 2 Booking_Rate Vizualization
# Weighted Booking Rate for Part 2
agg_booking_p2 <- OGFile_Part2 %>%
group_by(Segment, Email_AB) %>%
summarise(mean_booking = sum(MadeBooking) / sum(Delivered), .groups = "drop")
# Plot Booking Rate
ggplot(agg_booking_p2, aes(x = Segment, y = mean_booking, fill = Email_AB)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("Booking Rate (Proportion)") +
ggtitle("Booking Rate by Segment and Email Version (Part 2)") +
theme_minimal()
#Part 2 Sample Size Table
# Summarize Delivered emails by Segment and Email_AB
assignment_table_Part2 <- OGFile_Part2 %>%
group_by(Segment, Email_AB) %>%
summarise(Emails_Delivered = sum(Delivered), .groups = "drop")
# View the table
print(assignment_table_Part2)
## # A tibble: 8 × 3
## Segment Email_AB Emails_Delivered
## <fct> <fct> <dbl>
## 1 L CTL 729110
## 2 L MFF 734156
## 3 L OBC 736717
## 4 L VSA 737893
## 5 N CTL 286787
## 6 N MFF 288606
## 7 N OBC 289297
## 8 N VSA 287562
#Part 2 Area Chart
# Load necessary packages
library(dplyr)
library(ggplot2)
library(tidyr)
# 1. Summarize Delivered by date, segment, and email version
delivery_summary_Part2 <- OGFile_Part2 %>%
group_by(EMAIL_DATE, Segment, Email_AB) %>%
summarise(Daily_Delivered = sum(Delivered), .groups = "drop") %>%
arrange(EMAIL_DATE)
# 2. Create a grouping variable (Segment + Email_AB)
delivery_summary_Part2 <- delivery_summary_Part2 %>%
mutate(Group = paste(Segment, Email_AB, sep = "-"))
# 3. Calculate cumulative sum within each group
delivery_summary_Part2 <- delivery_summary_Part2 %>%
group_by(Group) %>%
arrange(EMAIL_DATE) %>%
mutate(Cumulative_Delivered = cumsum(Daily_Delivered)) %>%
ungroup()
# 4. Plot the area chart
ggplot(delivery_summary_Part2, aes(x = EMAIL_DATE, y = Cumulative_Delivered, fill = Group)) +
geom_area() +
labs(title = "Cumulative Emails Delivered Over Time by Segment and Email Version",
x = "Email Date",
y = "Cumulative Emails Delivered") +
theme_minimal()