MATH2406 - A3 RMD

LOADING & PRE-PROCESSING

# Load data
stream <- read_csv("data/streaming_data.csv")

# Convert date to date & add year
stream$year <- 2021
stream$date <- paste(stream$date, "/", stream$year)
stream$date <- as.Date(stream$date, format = "%d/%m / %Y")
stream <- stream [ ,-9]

# Convert gender to factor & fix levels to avoid R errors with use of "F"
stream$gender <- as.factor(stream$gender)
levels(stream$gender) <- list(`Female`  = "F", `Male` = "M") 

# Convert group to factor
stream$group <- as.factor(stream$group)

# Add factor variable based on date for pre-trial and trial
breaks <- as.Date(c("2021-07-01","2021-07-18","2021-07-31"))
stream$ab_trial <- cut(stream$date, breaks, labels = c("pre", "trial"))

# Rename variables for clarity & ease of use
stream <- rename(stream, tenure = time_since_signup)
stream <- rename(stream, demo = demographic)

# Check the age range of the dataset and explore potential definition of demographic variable
summary(stream$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   28.00   36.00   36.49   46.00   55.00

summary(stream$demo)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.603   4.000   4.000

stream <- stream %>%
          mutate(demo_chk = case_when(
                 between(age,18,35) ~ '18 - 35',
                 between(age,36,55) ~ '36 - 55'))

stream <- stream %>%
          mutate(demo_chk2 = case_when(
                demo_chk == '18 - 35' & gender == "Female" ~ 1,
                demo_chk == '18 - 35' & gender == "Male" ~ 2,
                demo_chk == '36 - 55' & gender == "Female" ~ 3,
                demo_chk == '36 - 55' & gender == "Male" ~ 4))


# Test if demo_chk is the same as original demo variable
identical(stream$demo, stream$demo_chk2)

## [1] TRUE

# Remove redundant test variables
stream <- stream [ ,-c(10, 11)]

# Add typical target marketing age groups variable for cross-market comparability
stream <- stream %>%
          mutate(age_group = case_when(
                 between(age,18,25) ~ '18 - 24',
                 between(age,25,34) ~ '25 - 34',
                 between(age,35,44) ~ '35 - 44',
                 between(age,45,55) ~ '45 - 55'))

stream$age_group <- factor(stream$age_group, 
                    levels=c('18 - 24','25 - 34', 
                             '35 - 44', '45 - 55'), ordered = TRUE)


# Reorder variables
stream <- stream[, c(1, 2, 3, 10, 6, 4, 5, 7, 9, 8)]

# Check changes
head(stream)

## # A tibble: 6 x 10
##   date       gender   age age_group  demo social_metric tenure group ab_trial
##   <date>     <fct>  <dbl> <ord>     <dbl>         <dbl>  <dbl> <fct> <fct>   
## 1 2021-07-01 Female    28 25 - 34       1             5   19.3 A     pre     
## 2 2021-07-01 Female    32 25 - 34       1             7   11.5 A     pre     
## 3 2021-07-01 Female    39 35 - 44       3             4    4.3 A     pre     
## 4 2021-07-01 Male      52 45 - 55       4            10    9.5 A     pre     
## 5 2021-07-01 Male      25 18 - 24       2             1   19.5 A     pre     
## 6 2021-07-01 Male      51 45 - 55       4             0   22.6 A     pre     
## # ... with 1 more variable: hours_watched <dbl>

ESTABLISH SAMPLE SUB GROUPS

# Create elements for each sub-group
total_trial <- stream %>%
               filter(ab_trial == "trial") %>%
               group_by(group) %>%
               count()

total_pre <- stream %>%
             filter(ab_trial == "pre") %>%
             group_by(group) %>%
             count()

A_group_pre <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "pre")

A_group_trial <- stream %>%
                 filter(group == "A")

B_group_trial <- stream %>%
                 filter(group == "B")

EXPLORE SAMPLE SUB-GROUPS

# Output sample sizes & gender distribution in study for each demographic group 
# Call out gender (although reflected in demo) for ease of parsing

# Full sample
all_groups_n <- stream %>%
               ungroup() %>%
               select(demo, gender, hours_watched) %>%
               group_by(demo, gender) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(demo, gender, n, mu_hw) %>%
               distinct()

n_total <- nrow(stream)
all_groups_n$p_x <- all_groups_n$n / n_total

# Group A Pre-Trial
A_group_pre_n <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "pre") %>%
               ungroup() %>%
               select(demo, gender, hours_watched) %>%
               group_by(demo, gender) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(demo, gender, n, mu_hw) %>%
               distinct()

n_total_a_pre <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "pre") %>%
               nrow()

A_group_pre_n$p_ap <- A_group_pre_n$n / n_total_a_pre

# Group A Trial
A_group_trial_n <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "trial") %>%
               ungroup() %>%
               select(demo, gender, hours_watched) %>%
               group_by(demo, gender) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(demo, gender, n, mu_hw) %>%
               distinct()

n_total_a_trial <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "trial") %>%
               nrow()

A_group_trial_n$p_at <- A_group_trial_n$n / n_total_a_trial

# Group B Trial
B_group_trial_n <- stream %>%
               filter(group == "B") %>%
               filter(ab_trial == "trial") %>%
               ungroup() %>%
               select(demo, gender, hours_watched) %>%
               group_by(demo, gender) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(demo, gender, n, mu_hw) %>%
               distinct()

n_total_b_trial <- stream %>%
               filter(group == "B") %>%
               filter(ab_trial == "trial") %>%
               nrow()

B_group_trial_n$p_bt <- B_group_trial_n$n / n_total_b_trial

TANGENT : ILLUSTRATE UNIQUE ID ISSUE

# Sample dataset snip for Unique ID issue illustration
sample_b_unique_id <- stream %>%
                      filter(group == "B") %>%
                      filter(demo == "1")

COMPARE AGE DIST. TO ABS POPULATION DATA

# Explore age distribution of sample
stream_age <- stream %>%
              group_by(age) %>%
              count() %>%
              summarise(n = sum(n))

stream_age <- stream_age %>%
              mutate(px = n / sum(n))

stream_age$source <- paste("Sample")


# Load relevant ABS population data for comparison
abs_pop <- read_csv("data/abs_age.csv")
abs_pop <- abs_pop[-c(1:5,106:109),-c(3:12)]

abs_pop <- rename(abs_pop, age = `Australian Bureau of Statistics`)
abs_pop <- rename(abs_pop, n = `...2`)

abs_pop$n <- as.numeric(abs_pop$n)
abs_pop$age <- as.numeric(abs_pop$age)

abs_pop <- abs_pop %>%
           filter(age > 17 & age < 56)

# Create proportions variable
abs_pop <- abs_pop %>%
           mutate(px = n / sum(n))

abs_pop$source <- paste("ABS Data")


# Plot comparison of sample group vs ABS population data
age_px <- rbind(abs_pop, stream_age)

age_dist <- ggplot(age_px, aes(x=age, y=px, fill=source))+ 
            geom_bar(position="dodge", stat = "identity", width = 0.7) +
            scale_fill_manual(values = c("#C7D4DE", "#00AAFF")) +
            labs(title="Age Distribution of Trial vs. Australian Popoulation",x="\nAge", 
                 y = "Proportion of Population\n")+
            guides(fill=guide_legend(title="Source")) +
            scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
            scale_y_continuous(labels = scales::percent) +
            theme_minimal()

age_dist

OUTLIERS CHECK

# Review the target variable
summary(stream$hours_watched)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.500   3.530   4.415   4.393   5.322   8.300

# Check for outliers
z.scores <- stream$hours_watched %>%
  outliers::scores(type = "z")

# Output the number of outliers in either direction
length(which(abs(z.scores) >3))

## [1] 0

DISTRIBUTION CHECK

# Check if the sample has a normal distribution (can it be used for "S" sample sd?)
k <- sum(all_groups_n$n)

dist_plot <- ggplot() +
                  geom_histogram(aes(x = stream$hours_watched), binwidth = .5, 
                          colour = "white", fill = "cornflowerblue", size = 0.1) +
                  stat_function(fun = function(x) dnorm(x, mean = mean(stream$hours_watched), 
                                                        sd = sd(stream$hours_watched)) * k * .5, 
                         color = "darkred", size = 1) +
                  labs(title="Normal Distribution Test for All Groups",
                       x="\nHours Watched", y = "Frequency\n") +
                  theme_minimal()


dist_plot

qqnorm(stream$hours_watched,
       pch = 1, frame = FALSE,
       col = "blue3",
       main = "Normal Distribution Test for All Groups",
       xlab = "Theoretical Quantiles", ylab = "Hours Watched")
qqline(stream$hours_watched, col = "darkred", lwd = 2)

DEFINE MINIMUM SAMPLE SIZE

# Calculate minimum sample size
z_crit <- qnorm((1+0.95)/2)
round(z_crit, digits = 2)

## [1] 1.96

# Without population sd, we use the sample sd (as shown to be normally distributed)
sd_x <- sd(stream$hours_watched)

# Margin of error/effect calculated with (mu_treatment - mu_control)
e <- mean(B_group_trial$hours_watched)-mean(A_group_trial$hours_watched)

# Minimum sample size formula
n_ss <- ((z_crit*sd_x)/e)^2
n_ss <- ceiling(n_ss)
n_ss

## [1] 31

MINIMUM SAMPLE SIZE TESTING

# Check if all groups meet minimum sample size
A_group_pre_n$n > n_ss

## [1] TRUE TRUE TRUE TRUE

A_group_trial_n$n > n_ss

## [1] TRUE TRUE TRUE TRUE

B_group_trial_n$n > n_ss

## [1] FALSE  TRUE FALSE FALSE

# Look into the groups that do not meet minimum sample size for more information
B_group_trial_n

## # A tibble: 4 x 5
## # Groups:   demo, gender [4]
##    demo gender     n mu_hw  p_bt
##   <dbl> <fct>  <int> <dbl> <dbl>
## 1     3 Female    15  4.28 0.136
## 2     4 Male      54  4.40 0.491
## 3     1 Female    12  5.76 0.109
## 4     2 Male      29  5.70 0.264

# Noting the gender bias in the treatment group (Group B), test if re-splitting the data into smaller age groups without consideration for gender creates more specific/useful outcomes from the experiment by target market group.

# Create revised demo groups using typical marketing age brackets
# Full sample
all_groups_n_rev <- stream %>%
               ungroup() %>%
               select(age_group, hours_watched) %>%
               group_by(age_group) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(age_group, n, mu_hw) %>%
               distinct()

all_groups_n_rev$p_x <- all_groups_n_rev$n / n_total

# Group A Pre-Trial
A_group_pre_n_rev <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "pre") %>%
               ungroup() %>%
               select(age_group, hours_watched) %>%
               group_by(age_group) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(age_group, n, mu_hw) %>%
               distinct()

A_group_pre_n_rev$p_ap <- A_group_pre_n_rev$n / n_total_a_pre


# Group A Trial
A_group_trial_n_rev <- stream %>%
               filter(group == "A") %>%
               filter(ab_trial == "trial") %>%
               ungroup() %>%
               select(age_group, hours_watched) %>%
               group_by(age_group) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(age_group, n, mu_hw) %>%
               distinct()

A_group_trial_n_rev$p_at <- A_group_trial_n_rev$n / n_total_a_trial

# Group B Trial
B_group_trial_n_rev <- stream %>%
               filter(group == "B") %>%
               filter(ab_trial == "trial") %>%
               ungroup() %>%
               select(age_group, hours_watched) %>%
               group_by(age_group) %>%
               mutate(n = n(), mu_hw = mean(hours_watched)) %>%
               select(age_group, n, mu_hw) %>%
               distinct()

B_group_trial_n_rev$p_bt <- B_group_trial_n_rev$n / n_total_b_trial


# Check if these groups meet minimum sample size
A_group_pre_n_rev$n > n_ss

## [1] TRUE TRUE TRUE TRUE

A_group_trial_n_rev$n > n_ss

## [1] TRUE TRUE TRUE TRUE

B_group_trial_n_rev$n > n_ss

## [1]  TRUE  TRUE FALSE FALSE

# Look into the groups that do not meet minimum sample size for more information
B_group_trial_n_rev

## # A tibble: 4 x 4
## # Groups:   age_group [4]
##   age_group     n mu_hw  p_bt
##   <ord>     <int> <dbl> <dbl>
## 1 35 - 44      34  4.86 0.309
## 2 45 - 55      39  4.06 0.355
## 3 25 - 34      26  5.70 0.236
## 4 18 - 24      11  5.87 0.1

VISUALISE SAMPLE SIZES

# Visualise the sample sizes

# Add vectors for correct labelling of joined dataframes
cols_list_full  <- list(demo = "Demographic", n.x.x = "All Groups", n.y.x = "A Group Pre-Trial", 
                   n.x.y = "A Group Trial", n.y.y = "B Group Trial")
cols_list_trial <- list(demo = "Demographic", n.x = "A Group Trial", n.y = "B Group Trial")


# Full sample by Demographic
all_trial_1 <- inner_join(all_groups_n, A_group_pre_n, by = "demo")
all_trial_2 <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")

all_trial <- inner_join(all_trial_1, all_trial_2, by = "demo")
all_trial_viz <- all_trial %>%
                 select(demo, n.x.x, n.y.x, n.x.y, n.y.y)
colnames(all_trial_viz) <- cols_list_full
all_trial_viz <- pivot_longer(all_trial_viz, cols = 2:5)

all_trial_viz$Demographic <- as.factor(all_trial_viz$Demographic)
levels(all_trial_viz$Demographic) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)

all_trial_viz$name <- as.factor(all_trial_viz$name)
levels(all_trial_viz$name) <- list("All Groups" = "All Groups", "A Group Pre-Trial" = "A Group Pre-Trial", 
                                   "A Group Trial" = "A Group Trial", "B Group Trial" = "B Group Trial", ordered = TRUE)

total_s_viz <- ggplot(all_trial_viz, aes(x=Demographic, y=value, fill = name)) + 
               geom_bar(position="dodge", stat = "identity") +
               scale_fill_manual(values = c("#2ECC71", "#FCF3CF","#F7DC6F","#00AAFF")) +
               labs(title="Sample Sizes by Demographic",x="\nDemographic", y = "Sample Size\n") +
               geom_hline(yintercept=31, color = "darkred", size = 1) +
               theme_minimal()
               
total_s_viz

# Trial Groups by Demographic
trial_demo <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")
trial_demo_viz <- trial_demo %>%
                  select(demo, n.x, n.y)
colnames(trial_demo_viz) <- cols_list_trial
trial_demo_viz <- pivot_longer(trial_demo_viz, cols = 2:3)

trial_demo_viz$Demographic <- as.factor(trial_demo_viz$Demographic)
levels(trial_demo_viz$Demographic) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)

trial_s_viz <- ggplot(trial_demo_viz, aes(x=Demographic, y=value, fill = name)) + 
               geom_bar(position="dodge", stat = "identity") +
               scale_fill_manual(values = c("#F7DC6F","#00AAFF")) +
               labs(title="Sample Sizes by Demographic",x="\nDemographic", y = "Sample Size\n") +
               geom_hline(yintercept=31, color = "darkred", size = 1) +
               theme_minimal()
               
trial_s_viz

# Trial Groups by Age Group
trial_rev <- inner_join(A_group_trial_n_rev, B_group_trial_n_rev, by = "age_group")
trial_rev_viz <- trial_rev %>%
                  select(age_group, n.x, n.y)
colnames(trial_rev_viz) <- cols_list_trial
trial_rev_viz <- pivot_longer(trial_rev_viz, cols = 2:3)

trial_rev_s_viz <- ggplot(trial_rev_viz, aes(x=Demographic, y=value, fill = name)) + 
               geom_bar(position="dodge", stat = "identity") +
               scale_fill_manual(values = c("#F7DC6F","#00AAFF")) +
               labs(title="Sample Sizes by Age Group",x="\nAge Group", y = "Sample Size\n") +
               geom_hline(yintercept=31, color = "darkred", size = 1) +
               theme_minimal()
               
trial_rev_s_viz

EXPLORE CORRELATIONS

# Explore broad correlations to target variable over the whole sample
# For each of these studies, the null hypothesis is that the variables are independent

# Categorical vs. Numeric: Demographic
demo_corr <- aov(hours_watched~demo, data = stream)
summary(demo_corr)

##              Df Sum Sq Mean Sq F value              Pr(>F)    
## demo          1    332   332.0   229.6 <0.0000000000000002 ***
## Residuals   998   1443     1.4                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Categorical vs. Numeric: Demographic
gender_corr <- aov(hours_watched~gender, data = stream)
summary(gender_corr)

##              Df Sum Sq Mean Sq F value Pr(>F)
## gender        1    1.3   1.251   0.704  0.402
## Residuals   998 1774.2   1.778

# Categorical vs. Numeric: Marketing Age Groups
agegr_corr <- aov(hours_watched~age_group, data = stream)
summary(agegr_corr)

##              Df Sum Sq Mean Sq F value              Pr(>F)    
## age_group     3    516  172.00     136 <0.0000000000000002 ***
## Residuals   996   1259    1.26                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Categorical vs. Numeric: Trial (pre vs. post)
trial_corr <- aov(hours_watched~ab_trial, data = stream)
summary(trial_corr)

##              Df Sum Sq Mean Sq F value Pr(>F)  
## ab_trial      1   11.4  11.387     6.5 0.0109 *
## Residuals   965 1690.6   1.752                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 33 observations deleted due to missingness

# Categorical vs. Numeric: Group
group_corr <- aov(hours_watched~group, data = stream)
summary(group_corr)

##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Numeric vs. Numeric: Age
age_corr <- cor(stream$hours_watched, stream$age, method="pearson")
round(age_corr, digits = 3)

## [1] -0.573

# Numeric vs. Numeric: Social Metric
sm_corr <- cor(stream$hours_watched, stream$social_metric, method="pearson")
round(sm_corr, digits = 3)

## [1] 0.233

# Numeric vs. Numeric: Tenure
tenure_corr <- cor(stream$hours_watched, stream$tenure, method="pearson")
round(tenure_corr, digits = 3)

## [1] -0.006

# Numeric vs. Numeric: Age & Social Metric
# We don't have a lot of information about this variable, so worth seeing if it is correlated 
# to Age and should be excluded from a multiple linear regression model
age_sm_corr <- cor(stream$age, stream$social_metric, method="pearson")
round(age_sm_corr, digits = 3)

## [1] -0.024

LINEAR & MULTIPLE LINEAR REGRESSION

# Explore confirmed correlations through simple linear regression
# Social metric is selected here as the Pearson Coefficient result suggests a correlation
sm_lm <- lm(hours_watched~social_metric, data = stream)
summary(sm_lm)

## 
## Call:
## lm(formula = hours_watched ~ social_metric, data = stream)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7986 -0.8364  0.0264  0.8577  3.6902 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    3.88366    0.07881   49.28 < 0.0000000000000002 ***
## social_metric  0.10373    0.01370    7.57   0.0000000000000848 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.297 on 998 degrees of freedom
## Multiple R-squared:  0.0543, Adjusted R-squared:  0.05335 
## F-statistic:  57.3 on 1 and 998 DF,  p-value: 0.00000000000008481

# Explore confirmed correlations through multiple linear regression
reg_model <- lm(hours_watched ~ group + age + social_metric, data = stream)
summary(reg_model)

## 
## Call:
## lm(formula = hours_watched ~ group + age + social_metric, data = stream)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6485 -0.6500 -0.0130  0.7169  2.9047 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    6.494742   0.129275  50.240 < 0.0000000000000002 ***
## groupB         0.643171   0.101042   6.365       0.000000000297 ***
## age           -0.072466   0.003072 -23.587 < 0.0000000000000002 ***
## social_metric  0.094722   0.010934   8.663 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.034 on 996 degrees of freedom
## Multiple R-squared:  0.4006, Adjusted R-squared:  0.3987 
## F-statistic: 221.8 on 3 and 996 DF,  p-value: < 0.00000000000000022

EFFECT / FINDINGS

# Compare the trial groups
# Overall, was there an increase in mean hours watched during the trial?
A_group_trial_mu_hw <- mean(A_group_trial$hours_watched)
B_group_trial_mu_hw <- mean(B_group_trial$hours_watched)

trial_eff <- (B_group_trial_mu_hw - A_group_trial_mu_hw) / A_group_trial_mu_hw * 100
trial_eff

## [1] 10.94872

# Was there any proportional difference in Group A behaviour before and during the trial?
A_group_pre_mu_hw <- mean(A_group_pre$hours_watched)

pre_trial_eff <- (A_group_trial_mu_hw - A_group_pre_mu_hw) / A_group_pre_mu_hw * 100
pre_trial_eff

## [1] 0.9279207

# Compare the proportional effect of the trial between demographic groups
trial_eff_demo <- inner_join(A_group_trial_n, B_group_trial_n, by = "demo")

trial_eff_demo$effect_mu_hw <- trial_eff_demo$mu_hw.y - trial_eff_demo$mu_hw.x
trial_eff_demo$effect_perc  <- trial_eff_demo$effect_mu_hw / trial_eff_demo$mu_hw.x * 100

# Add the minimum sample size as a variable
trial_eff_demo$n_ss <- n_ss
trial_eff_demo$significant <- trial_eff_demo$n.y > trial_eff_demo$n_ss

# Create a reduced summary dataframe for visualisation
summary_trial_eff_demo <- trial_eff_demo %>%
                          select(demo, gender.x, effect_mu_hw, effect_perc, significant)
summary_trial_eff_demo

## # A tibble: 4 x 5
## # Groups:   demo [4]
##    demo gender.x effect_mu_hw effect_perc significant
##   <dbl> <fct>           <dbl>       <dbl> <lgl>      
## 1     4 Male            0.731        19.9 TRUE       
## 2     3 Female          0.742        21.0 FALSE      
## 3     1 Female          0.590        11.4 FALSE      
## 4     2 Male            0.586        11.4 FALSE

# Compare the proportional effect of the trial between marketing age groups
trial_eff_rev <- inner_join(A_group_trial_n_rev, B_group_trial_n_rev, by = "age_group")

trial_eff_rev$effect_mu_hw <- trial_eff_rev$mu_hw.y - trial_eff_rev$mu_hw.x
trial_eff_rev$effect_perc  <- trial_eff_rev$effect_mu_hw / trial_eff_rev$mu_hw.x * 100

# Add the minimum sample size as a variable
trial_eff_rev$n_ss <- n_ss
trial_eff_rev$significant <- trial_eff_rev$n.y > trial_eff_rev$n_ss

# Create a reduced summary dataframe for visualisation
summary_trial_eff_rev <- trial_eff_rev %>%
                         select(age_group, effect_mu_hw, effect_perc, significant)
summary_trial_eff_rev

## # A tibble: 4 x 4
## # Groups:   age_group [4]
##   age_group effect_mu_hw effect_perc significant
##   <ord>            <dbl>       <dbl> <lgl>      
## 1 45 - 55          0.743       22.4  TRUE       
## 2 35 - 44          0.844       21.0  TRUE       
## 3 18 - 24          0.343        6.21 FALSE      
## 4 25 - 34          0.857       17.7  FALSE

VISUALISE EFFECT / FINDINGS

# Visualise the effect % change in each breakdown

# Assign levels to demo variable for clear plot labels
summary_trial_eff_demo$demo <- as.factor(summary_trial_eff_demo$demo)
levels(summary_trial_eff_demo$demo) <- list('Female 18-35' = 1, 'Male 18-35' = 2, 'Female 36-55' = 3, 'Male 36-55' = 4)

# Plot demographic based effect % change
p_summary_trial_eff_demo <- ggplot(summary_trial_eff_demo, aes(x = demo, y=0, fill = effect_perc)) + 
                           scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
                           geom_tile(color = "transparent", size = 0.1) +
                           geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
                           labs(x = "\nDemographic", y = "", fill = "% Increase in Hours Watched") +
                           coord_fixed(ratio = 1) +
                           theme_minimal()
p_summary_trial_eff_demo

# Plot age group based effect % change
p_summary_trial_eff_rev <- ggplot(summary_trial_eff_rev, aes(x = age_group, y=0, fill = effect_perc)) + 
                           scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
                           geom_tile(color = "transparent", size = 0.1) +
                           geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
                           labs(x = "\nAge Group", y = "", fill = "% Increase in Hours Watched") +
                           coord_fixed(ratio = 1) +
                           theme_minimal()
p_summary_trial_eff_rev

# Visualise the effect % change in each breakdown, excluding samples < minimum sample size

# Replace the non-significant values with 0 to demonstrate lack of data
summary_trial_eff_demo_plt <- summary_trial_eff_demo
summary_trial_eff_demo_plt$effect_perc[summary_trial_eff_demo_plt$significant != "TRUE"] <- 0

summary_trial_eff_rev_plt <- summary_trial_eff_rev
summary_trial_eff_rev_plt$effect_perc[summary_trial_eff_rev_plt$significant != "TRUE"] <- 0

# Plot demographic based effect % change
p_summary_trial_eff_demo2 <- ggplot(summary_trial_eff_demo_plt, aes(x = demo, y=0, fill = effect_perc)) + 
                           scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
                           geom_tile(color = "transparent", size = 0.1) +
                           geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
                           labs(x = "\nDemographic", y = "", fill = "% Increase in Hours Watched") +
                           coord_fixed(ratio = 1) +
                           theme_minimal()
p_summary_trial_eff_demo2

# Plot age group based effect % change
p_summary_trial_eff_rev2 <- ggplot(summary_trial_eff_rev_plt, aes(x = age_group, y=0, fill = effect_perc)) + 
                           scale_fill_gradient2(low = '#E1F5FF', high = '0068FF', midpoint = 0, limits = range(0,25)) +
                           geom_tile(color = "transparent", size = 0.1) +
                           geom_text(aes(label = sprintf("%1.2f%%", effect_perc)), size=4, family = "mono", fontface = "bold") +
                           labs(x = "\nAge Group", y = "", fill = "% Increase in Hours Watched") +
                           coord_fixed(ratio = 1) +
                           theme_minimal()
p_summary_trial_eff_rev2

QUANTIFY LEVEL OF CONFIDENCE TO USE ALL SAMPLES

# Calculate minimum sample size at 90% confidence
z_crit_90 <- qnorm((1+0.90)/2)
round(z_crit_90, digits = 2)

## [1] 1.64

# Minimum sample size formula
n_ss_90 <- ((z_crit_90*sd_x)/e)^2
n_ss_90 <- ceiling(n_ss_90)
n_ss_90

## [1] 22

# Calculate minimum sample size at 75% confidence
z_crit_75 <- qnorm((1+0.75)/2)
round(z_crit_75, digits = 2)

## [1] 1.15

# Minimum sample size formula
n_ss_75 <- ((z_crit_75*sd_x)/e)^2
n_ss_75 <- ceiling(n_ss_75)
n_ss_75

## [1] 11

# Test against Demographic groups sample sizes
trial_eff_demo$n_ss_75 <- n_ss_75
trial_eff_demo$n.y > trial_eff_demo$n_ss_75

## [1] TRUE TRUE TRUE TRUE

# Test against Age Groups sample sizes
trial_eff_rev$n_ss_90 <- n_ss_90
trial_eff_rev$n.y > trial_eff_rev$n_ss_90

## [1]  TRUE  TRUE FALSE  TRUE

MATH2406 - A3 RMD

s3910632

24/02/2022