LOADING & PRE-PROCESSING
# Load data
stream <- read_csv("data/streaming_data.csv")
# Convert date to date & add year
stream$year <- 2021
stream$date <- paste(stream$date, "/", stream$year)
stream$date <- as.Date(stream$date, format = "%d/%m / %Y")
stream <- stream [ ,-9]
# Convert gender to factor & fix levels to avoid R errors with use of "F"
stream$gender <- as.factor(stream$gender)
levels(stream$gender) <- list(`Female` = "F", `Male` = "M")
# Convert group to factor
stream$group <- as.factor(stream$group)
# Add factor variable based on date for pre-trial and trial
breaks <- as.Date(c("2021-07-01","2021-07-18","2021-07-31"))
stream$ab_trial <- cut(stream$date, breaks, labels = c("pre", "trial"))
# Rename variables for clarity & ease of use
stream <- rename(stream, tenure = time_since_signup)
stream <- rename(stream, demo = demographic)
# Check the age range of the dataset and explore potential definition of demographic variable
summary(stream$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.00 36.00 36.49 46.00 55.00
summary(stream$demo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.603 4.000 4.000
stream <- stream %>%
mutate(demo_chk = case_when(
between(age,18,35) ~ '18 - 35',
between(age,36,55) ~ '36 - 55'))
stream <- stream %>%
mutate(demo_chk2 = case_when(
demo_chk == '18 - 35' & gender == "Female" ~ 1,
demo_chk == '18 - 35' & gender == "Male" ~ 2,
demo_chk == '36 - 55' & gender == "Female" ~ 3,
demo_chk == '36 - 55' & gender == "Male" ~ 4))
# Test if demo_chk is the same as original demo variable
identical(stream$demo, stream$demo_chk2)
## [1] TRUE
# Remove redundant test variables
stream <- stream [ ,-c(10, 11)]
# Add typical target marketing age groups variable for cross-market comparability
stream <- stream %>%
mutate(age_group = case_when(
between(age,18,25) ~ '18 - 24',
between(age,25,34) ~ '25 - 34',
between(age,35,44) ~ '35 - 44',
between(age,45,55) ~ '45 - 55'))
stream$age_group <- factor(stream$age_group,
levels=c('18 - 24','25 - 34',
'35 - 44', '45 - 55'), ordered = TRUE)
# Reorder variables
stream <- stream[, c(1, 2, 3, 10, 6, 4, 5, 7, 9, 8)]
# Check changes
head(stream)
## # A tibble: 6 x 10
## date gender age age_group demo social_metric tenure group ab_trial
## <date> <fct> <dbl> <ord> <dbl> <dbl> <dbl> <fct> <fct>
## 1 2021-07-01 Female 28 25 - 34 1 5 19.3 A pre
## 2 2021-07-01 Female 32 25 - 34 1 7 11.5 A pre
## 3 2021-07-01 Female 39 35 - 44 3 4 4.3 A pre
## 4 2021-07-01 Male 52 45 - 55 4 10 9.5 A pre
## 5 2021-07-01 Male 25 18 - 24 2 1 19.5 A pre
## 6 2021-07-01 Male 51 45 - 55 4 0 22.6 A pre
## # ... with 1 more variable: hours_watched <dbl>
ESTABLISH SAMPLE SUB GROUPS
# Create elements for each sub-group
total_trial <- stream %>%
filter(ab_trial == "trial") %>%
group_by(group) %>%
count()
total_pre <- stream %>%
filter(ab_trial == "pre") %>%
group_by(group) %>%
count()
A_group_pre <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "pre")
A_group_trial <- stream %>%
filter(group == "A")
B_group_trial <- stream %>%
filter(group == "B")
EXPLORE SAMPLE SUB-GROUPS
# Output sample sizes & gender distribution in study for each demographic group
# Call out gender (although reflected in demo) for ease of parsing
# Full sample
all_groups_n <- stream %>%
ungroup() %>%
select(demo, gender, hours_watched) %>%
group_by(demo, gender) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(demo, gender, n, mu_hw) %>%
distinct()
n_total <- nrow(stream)
all_groups_n$p_x <- all_groups_n$n / n_total
# Group A Pre-Trial
A_group_pre_n <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "pre") %>%
ungroup() %>%
select(demo, gender, hours_watched) %>%
group_by(demo, gender) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(demo, gender, n, mu_hw) %>%
distinct()
n_total_a_pre <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "pre") %>%
nrow()
A_group_pre_n$p_ap <- A_group_pre_n$n / n_total_a_pre
# Group A Trial
A_group_trial_n <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "trial") %>%
ungroup() %>%
select(demo, gender, hours_watched) %>%
group_by(demo, gender) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(demo, gender, n, mu_hw) %>%
distinct()
n_total_a_trial <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "trial") %>%
nrow()
A_group_trial_n$p_at <- A_group_trial_n$n / n_total_a_trial
# Group B Trial
B_group_trial_n <- stream %>%
filter(group == "B") %>%
filter(ab_trial == "trial") %>%
ungroup() %>%
select(demo, gender, hours_watched) %>%
group_by(demo, gender) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(demo, gender, n, mu_hw) %>%
distinct()
n_total_b_trial <- stream %>%
filter(group == "B") %>%
filter(ab_trial == "trial") %>%
nrow()
B_group_trial_n$p_bt <- B_group_trial_n$n / n_total_b_trial
TANGENT : ILLUSTRATE UNIQUE ID ISSUE
# Sample dataset snip for Unique ID issue illustration
sample_b_unique_id <- stream %>%
filter(group == "B") %>%
filter(demo == "1")
COMPARE AGE DIST. TO ABS POPULATION DATA
# Explore age distribution of sample
stream_age <- stream %>%
group_by(age) %>%
count() %>%
summarise(n = sum(n))
stream_age <- stream_age %>%
mutate(px = n / sum(n))
stream_age$source <- paste("Sample")
# Load relevant ABS population data for comparison
abs_pop <- read_csv("data/abs_age.csv")
abs_pop <- abs_pop[-c(1:5,106:109),-c(3:12)]
abs_pop <- rename(abs_pop, age = `Australian Bureau of Statistics`)
abs_pop <- rename(abs_pop, n = `...2`)
abs_pop$n <- as.numeric(abs_pop$n)
abs_pop$age <- as.numeric(abs_pop$age)
abs_pop <- abs_pop %>%
filter(age > 17 & age < 56)
# Create proportions variable
abs_pop <- abs_pop %>%
mutate(px = n / sum(n))
abs_pop$source <- paste("ABS Data")
# Plot comparison of sample group vs ABS population data
age_px <- rbind(abs_pop, stream_age)
age_dist <- ggplot(age_px, aes(x=age, y=px, fill=source))+
geom_bar(position="dodge", stat = "identity", width = 0.7) +
scale_fill_manual(values = c("#C7D4DE", "#00AAFF")) +
labs(title="Age Distribution of Trial vs. Australian Popoulation",x="\nAge",
y = "Proportion of Population\n")+
guides(fill=guide_legend(title="Source")) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(labels = scales::percent) +
theme_minimal()
age_dist
OUTLIERS CHECK
# Review the target variable
summary(stream$hours_watched)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.500 3.530 4.415 4.393 5.322 8.300
# Check for outliers
z.scores <- stream$hours_watched %>%
outliers::scores(type = "z")
# Output the number of outliers in either direction
length(which(abs(z.scores) >3))
## [1] 0
DISTRIBUTION CHECK
# Check if the sample has a normal distribution (can it be used for "S" sample sd?)
k <- sum(all_groups_n$n)
dist_plot <- ggplot() +
geom_histogram(aes(x = stream$hours_watched), binwidth = .5,
colour = "white", fill = "cornflowerblue", size = 0.1) +
stat_function(fun = function(x) dnorm(x, mean = mean(stream$hours_watched),
sd = sd(stream$hours_watched)) * k * .5,
color = "darkred", size = 1) +
labs(title="Normal Distribution Test for All Groups",
x="\nHours Watched", y = "Frequency\n") +
theme_minimal()
dist_plot
qqnorm(stream$hours_watched,
pch = 1, frame = FALSE,
col = "blue3",
main = "Normal Distribution Test for All Groups",
xlab = "Theoretical Quantiles", ylab = "Hours Watched")
qqline(stream$hours_watched, col = "darkred", lwd = 2)
DEFINE MINIMUM SAMPLE SIZE
# Calculate minimum sample size
z_crit <- qnorm((1+0.95)/2)
round(z_crit, digits = 2)
## [1] 1.96
# Without population sd, we use the sample sd (as shown to be normally distributed)
sd_x <- sd(stream$hours_watched)
# Margin of error/effect calculated with (mu_treatment - mu_control)
e <- mean(B_group_trial$hours_watched)-mean(A_group_trial$hours_watched)
# Minimum sample size formula
n_ss <- ((z_crit*sd_x)/e)^2
n_ss <- ceiling(n_ss)
n_ss
## [1] 31
MINIMUM SAMPLE SIZE TESTING
# Check if all groups meet minimum sample size
A_group_pre_n$n > n_ss
## [1] TRUE TRUE TRUE TRUE
A_group_trial_n$n > n_ss
## [1] TRUE TRUE TRUE TRUE
B_group_trial_n$n > n_ss
## [1] FALSE TRUE FALSE FALSE
# Look into the groups that do not meet minimum sample size for more information
B_group_trial_n
## # A tibble: 4 x 5
## # Groups: demo, gender [4]
## demo gender n mu_hw p_bt
## <dbl> <fct> <int> <dbl> <dbl>
## 1 3 Female 15 4.28 0.136
## 2 4 Male 54 4.40 0.491
## 3 1 Female 12 5.76 0.109
## 4 2 Male 29 5.70 0.264
# Noting the gender bias in the treatment group (Group B), test if re-splitting the data into smaller age groups without consideration for gender creates more specific/useful outcomes from the experiment by target market group.
# Create revised demo groups using typical marketing age brackets
# Full sample
all_groups_n_rev <- stream %>%
ungroup() %>%
select(age_group, hours_watched) %>%
group_by(age_group) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(age_group, n, mu_hw) %>%
distinct()
all_groups_n_rev$p_x <- all_groups_n_rev$n / n_total
# Group A Pre-Trial
A_group_pre_n_rev <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "pre") %>%
ungroup() %>%
select(age_group, hours_watched) %>%
group_by(age_group) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(age_group, n, mu_hw) %>%
distinct()
A_group_pre_n_rev$p_ap <- A_group_pre_n_rev$n / n_total_a_pre
# Group A Trial
A_group_trial_n_rev <- stream %>%
filter(group == "A") %>%
filter(ab_trial == "trial") %>%
ungroup() %>%
select(age_group, hours_watched) %>%
group_by(age_group) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(age_group, n, mu_hw) %>%
distinct()
A_group_trial_n_rev$p_at <- A_group_trial_n_rev$n / n_total_a_trial
# Group B Trial
B_group_trial_n_rev <- stream %>%
filter(group == "B") %>%
filter(ab_trial == "trial") %>%
ungroup() %>%
select(age_group, hours_watched) %>%
group_by(age_group) %>%
mutate(n = n(), mu_hw = mean(hours_watched)) %>%
select(age_group, n, mu_hw) %>%
distinct()
B_group_trial_n_rev$p_bt <- B_group_trial_n_rev$n / n_total_b_trial
# Check if these groups meet minimum sample size
A_group_pre_n_rev$n > n_ss
## [1] TRUE TRUE TRUE TRUE
A_group_trial_n_rev$n > n_ss
## [1] TRUE TRUE TRUE TRUE
B_group_trial_n_rev$n > n_ss
## [1] TRUE TRUE FALSE FALSE
# Look into the groups that do not meet minimum sample size for more information
B_group_trial_n_rev
## # A tibble: 4 x 4
## # Groups: age_group [4]
## age_group n mu_hw p_bt
## <ord> <int> <dbl> <dbl>
## 1 35 - 44 34 4.86 0.309
## 2 45 - 55 39 4.06 0.355
## 3 25 - 34 26 5.70 0.236
## 4 18 - 24 11 5.87 0.1
VISUALISE SAMPLE SIZES
# Visualise the sample sizes
# Add vectors for correct labelling of joined dataframes
cols_list_full <- list(demo = "Demographic", n.x.x = "All Groups", n.y.x = "A Group Pre-Trial",
n.x.y = "A Group Trial", n.y.y = "B Group Trial")
cols_list_trial <- list(demo = "Demographic", n.x = "A Group Trial", n.y = "B Group Trial")
# Full sample by Demographic
all_trial_1 <- inner_join(all_groups_n, A_group_pre_n, by = "demo")
all_trial_2 <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")
all_trial <- inner_join(all_trial_1, all_trial_2, by = "demo")
all_trial_viz <- all_trial %>%
select(demo, n.x.x, n.y.x, n.x.y, n.y.y)
colnames(all_trial_viz) <- cols_list_full
all_trial_viz <- pivot_longer(all_trial_viz, cols = 2:5)
all_trial_viz$Demographic <- as.factor(all_trial_viz$Demographic)
levels(all_trial_viz$Demographic) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)
all_trial_viz$name <- as.factor(all_trial_viz$name)
levels(all_trial_viz$name) <- list("All Groups" = "All Groups", "A Group Pre-Trial" = "A Group Pre-Trial",
"A Group Trial" = "A Group Trial", "B Group Trial" = "B Group Trial", ordered = TRUE)
total_s_viz <- ggplot(all_trial_viz, aes(x=Demographic, y=value, fill = name)) +
geom_bar(position="dodge", stat = "identity") +
scale_fill_manual(values = c("#2ECC71", "#FCF3CF","#F7DC6F","#00AAFF")) +
labs(title="Sample Sizes by Demographic",x="\nDemographic", y = "Sample Size\n") +
geom_hline(yintercept=31, color = "darkred", size = 1) +
theme_minimal()
total_s_viz
# Trial Groups by Demographic
trial_demo <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")
trial_demo_viz <- trial_demo %>%
select(demo, n.x, n.y)
colnames(trial_demo_viz) <- cols_list_trial
trial_demo_viz <- pivot_longer(trial_demo_viz, cols = 2:3)
trial_demo_viz$Demographic <- as.factor(trial_demo_viz$Demographic)
levels(trial_demo_viz$Demographic) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)
trial_s_viz <- ggplot(trial_demo_viz, aes(x=Demographic, y=value, fill = name)) +
geom_bar(position="dodge", stat = "identity") +
scale_fill_manual(values = c("#F7DC6F","#00AAFF")) +
labs(title="Sample Sizes by Demographic",x="\nDemographic", y = "Sample Size\n") +
geom_hline(yintercept=31, color = "darkred", size = 1) +
theme_minimal()
trial_s_viz
# Trial Groups by Age Group
trial_rev <- inner_join(A_group_trial_n_rev, B_group_trial_n_rev, by = "age_group")
trial_rev_viz <- trial_rev %>%
select(age_group, n.x, n.y)
colnames(trial_rev_viz) <- cols_list_trial
trial_rev_viz <- pivot_longer(trial_rev_viz, cols = 2:3)
trial_rev_s_viz <- ggplot(trial_rev_viz, aes(x=Demographic, y=value, fill = name)) +
geom_bar(position="dodge", stat = "identity") +
scale_fill_manual(values = c("#F7DC6F","#00AAFF")) +
labs(title="Sample Sizes by Age Group",x="\nAge Group", y = "Sample Size\n") +
geom_hline(yintercept=31, color = "darkred", size = 1) +
theme_minimal()
trial_rev_s_viz
EXPLORE CORRELATIONS
# Explore broad correlations to target variable over the whole sample
# For each of these studies, the null hypothesis is that the variables are independent
# Categorical vs. Numeric: Demographic
demo_corr <- aov(hours_watched~demo, data = stream)
summary(demo_corr)
## Df Sum Sq Mean Sq F value Pr(>F)
## demo 1 332 332.0 229.6 <0.0000000000000002 ***
## Residuals 998 1443 1.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Categorical vs. Numeric: Demographic
gender_corr <- aov(hours_watched~gender, data = stream)
summary(gender_corr)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 1.3 1.251 0.704 0.402
## Residuals 998 1774.2 1.778
# Categorical vs. Numeric: Marketing Age Groups
agegr_corr <- aov(hours_watched~age_group, data = stream)
summary(agegr_corr)
## Df Sum Sq Mean Sq F value Pr(>F)
## age_group 3 516 172.00 136 <0.0000000000000002 ***
## Residuals 996 1259 1.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Categorical vs. Numeric: Trial (pre vs. post)
trial_corr <- aov(hours_watched~ab_trial, data = stream)
summary(trial_corr)
## Df Sum Sq Mean Sq F value Pr(>F)
## ab_trial 1 11.4 11.387 6.5 0.0109 *
## Residuals 965 1690.6 1.752
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 33 observations deleted due to missingness
# Categorical vs. Numeric: Group
group_corr <- aov(hours_watched~group, data = stream)
summary(group_corr)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Numeric vs. Numeric: Age
age_corr <- cor(stream$hours_watched, stream$age, method="pearson")
round(age_corr, digits = 3)
## [1] -0.573
# Numeric vs. Numeric: Social Metric
sm_corr <- cor(stream$hours_watched, stream$social_metric, method="pearson")
round(sm_corr, digits = 3)
## [1] 0.233
# Numeric vs. Numeric: Tenure
tenure_corr <- cor(stream$hours_watched, stream$tenure, method="pearson")
round(tenure_corr, digits = 3)
## [1] -0.006
# Numeric vs. Numeric: Age & Social Metric
# We don't have a lot of information about this variable, so worth seeing if it is correlated
# to Age and should be excluded from a multiple linear regression model
age_sm_corr <- cor(stream$age, stream$social_metric, method="pearson")
round(age_sm_corr, digits = 3)
## [1] -0.024
LINEAR & MULTIPLE LINEAR REGRESSION
# Explore confirmed correlations through simple linear regression
# Social metric is selected here as the Pearson Coefficient result suggests a correlation
sm_lm <- lm(hours_watched~social_metric, data = stream)
summary(sm_lm)
##
## Call:
## lm(formula = hours_watched ~ social_metric, data = stream)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7986 -0.8364 0.0264 0.8577 3.6902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88366 0.07881 49.28 < 0.0000000000000002 ***
## social_metric 0.10373 0.01370 7.57 0.0000000000000848 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.297 on 998 degrees of freedom
## Multiple R-squared: 0.0543, Adjusted R-squared: 0.05335
## F-statistic: 57.3 on 1 and 998 DF, p-value: 0.00000000000008481
# Explore confirmed correlations through multiple linear regression
reg_model <- lm(hours_watched ~ group + age + social_metric, data = stream)
summary(reg_model)
##
## Call:
## lm(formula = hours_watched ~ group + age + social_metric, data = stream)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6485 -0.6500 -0.0130 0.7169 2.9047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.494742 0.129275 50.240 < 0.0000000000000002 ***
## groupB 0.643171 0.101042 6.365 0.000000000297 ***
## age -0.072466 0.003072 -23.587 < 0.0000000000000002 ***
## social_metric 0.094722 0.010934 8.663 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.034 on 996 degrees of freedom
## Multiple R-squared: 0.4006, Adjusted R-squared: 0.3987
## F-statistic: 221.8 on 3 and 996 DF, p-value: < 0.00000000000000022
EFFECT / FINDINGS
# Compare the trial groups
# Overall, was there an increase in mean hours watched during the trial?
A_group_trial_mu_hw <- mean(A_group_trial$hours_watched)
B_group_trial_mu_hw <- mean(B_group_trial$hours_watched)
trial_eff <- (B_group_trial_mu_hw - A_group_trial_mu_hw) / A_group_trial_mu_hw * 100
trial_eff
## [1] 10.94872
# Was there any proportional difference in Group A behaviour before and during the trial?
A_group_pre_mu_hw <- mean(A_group_pre$hours_watched)
pre_trial_eff <- (A_group_trial_mu_hw - A_group_pre_mu_hw) / A_group_pre_mu_hw * 100
pre_trial_eff
## [1] 0.9279207
# Compare the proportional effect of the trial between demographic groups
trial_eff_demo <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")
trial_eff_demo$effect_mu_hw <- trial_eff_demo$mu_hw.y - trial_eff_demo$mu_hw.x
trial_eff_demo$effect_perc <- trial_eff_demo$effect_mu_hw / trial_eff_demo$mu_hw.x * 100
# Add the minimum sample size as a variable
trial_eff_demo$n_ss <- n_ss
trial_eff_demo$significant <- trial_eff_demo$n.y > trial_eff_demo$n_ss
# Create a reduced summary dataframe for visualisation
summary_trial_eff_demo <- trial_eff_demo %>%
select(demo, gender.x, effect_mu_hw, effect_perc, significant)
summary_trial_eff_demo
## # A tibble: 4 x 5
## # Groups: demo [4]
## demo gender.x effect_mu_hw effect_perc significant
## <dbl> <fct> <dbl> <dbl> <lgl>
## 1 4 Male 0.731 19.9 TRUE
## 2 3 Female 0.742 21.0 FALSE
## 3 1 Female 0.590 11.4 FALSE
## 4 2 Male 0.586 11.4 FALSE
# Compare the proportional effect of the trial between marketing age groups
trial_eff_rev <- inner_join(A_group_trial_n_rev, B_group_trial_n_rev, by = "age_group")
trial_eff_rev$effect_mu_hw <- trial_eff_rev$mu_hw.y - trial_eff_rev$mu_hw.x
trial_eff_rev$effect_perc <- trial_eff_rev$effect_mu_hw / trial_eff_rev$mu_hw.x * 100
# Add the minimum sample size as a variable
trial_eff_rev$n_ss <- n_ss
trial_eff_rev$significant <- trial_eff_rev$n.y > trial_eff_rev$n_ss
# Create a reduced summary dataframe for visualisation
summary_trial_eff_rev <- trial_eff_rev %>%
select(age_group, effect_mu_hw, effect_perc, significant)
summary_trial_eff_rev
## # A tibble: 4 x 4
## # Groups: age_group [4]
## age_group effect_mu_hw effect_perc significant
## <ord> <dbl> <dbl> <lgl>
## 1 45 - 55 0.743 22.4 TRUE
## 2 35 - 44 0.844 21.0 TRUE
## 3 18 - 24 0.343 6.21 FALSE
## 4 25 - 34 0.857 17.7 FALSE
VISUALISE EFFECT / FINDINGS
# Visualise the effect % change in each breakdown
# Assign levels to demo variable for clear plot labels
summary_trial_eff_demo$demo <- as.factor(summary_trial_eff_demo$demo)
levels(summary_trial_eff_demo$demo) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)
# Plot demographic based effect % change
p_summary_trial_eff_demo <- ggplot(summary_trial_eff_demo, aes(x = demo, y=0, fill = effect_perc)) +
scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
geom_tile(color = "transparent", size = 0.1) +
geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
labs(x = "\nDemographic", y = "", fill = "% Increase in Hours Watched") +
coord_fixed(ratio = 1) +
theme_minimal()
p_summary_trial_eff_demo
# Plot age group based effect % change
p_summary_trial_eff_rev <- ggplot(summary_trial_eff_rev, aes(x = age_group, y=0, fill = effect_perc)) +
scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
geom_tile(color = "transparent", size = 0.1) +
geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
labs(x = "\nAge Group", y = "", fill = "% Increase in Hours Watched") +
coord_fixed(ratio = 1) +
theme_minimal()
p_summary_trial_eff_rev
# Visualise the effect % change in each breakdown, excluding samples < minimum sample size
# Replace the non-significant values with 0 to demonstrate lack of data
summary_trial_eff_demo_plt <- summary_trial_eff_demo
summary_trial_eff_demo_plt$effect_perc[summary_trial_eff_demo_plt$significant != "TRUE"] <- 0
summary_trial_eff_rev_plt <- summary_trial_eff_rev
summary_trial_eff_rev_plt$effect_perc[summary_trial_eff_rev_plt$significant != "TRUE"] <- 0
# Plot demographic based effect % change
p_summary_trial_eff_demo2 <- ggplot(summary_trial_eff_demo_plt, aes(x = demo, y=0, fill = effect_perc)) +
scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
geom_tile(color = "transparent", size = 0.1) +
geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
labs(x = "\nDemographic", y = "", fill = "% Increase in Hours Watched") +
coord_fixed(ratio = 1) +
theme_minimal()
p_summary_trial_eff_demo2
# Plot age group based effect % change
p_summary_trial_eff_rev2 <- ggplot(summary_trial_eff_rev_plt, aes(x = age_group, y=0, fill = effect_perc)) +
scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
geom_tile(color = "transparent", size = 0.1) +
geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
labs(x = "\nAge Group", y = "", fill = "% Increase in Hours Watched") +
coord_fixed(ratio = 1) +
theme_minimal()
p_summary_trial_eff_rev2
QUANTIFY LEVEL OF CONFIDENCE TO USE ALL SAMPLES
# Calculate minimum sample size at 90% confidence
z_crit_90 <- qnorm((1+0.90)/2)
round(z_crit_90, digits = 2)
## [1] 1.64
# Minimum sample size formula
n_ss_90 <- ((z_crit_90*sd_x)/e)^2
n_ss_90 <- ceiling(n_ss_90)
n_ss_90
## [1] 22
# Calculate minimum sample size at 75% confidence
z_crit_75 <- qnorm((1+0.75)/2)
round(z_crit_75, digits = 2)
## [1] 1.15
# Minimum sample size formula
n_ss_75 <- ((z_crit_75*sd_x)/e)^2
n_ss_75 <- ceiling(n_ss_75)
n_ss_75
## [1] 11
# Test against Demographic groups sample sizes
trial_eff_demo$n_ss_75 <- n_ss_75
trial_eff_demo$n.y > trial_eff_demo$n_ss_75
## [1] TRUE TRUE TRUE TRUE
# Test against Age Groups sample sizes
trial_eff_rev$n_ss_90 <- n_ss_90
trial_eff_rev$n.y > trial_eff_rev$n_ss_90
## [1] TRUE TRUE FALSE TRUE