Introduction

# All the required libraries
library(ggplot2)  # for data visualisation 
library(magrittr) # for pipes 
library(dplyr)    # for data wrangling

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)    # for data wrangling

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

library(readr)    # for reading data
library(MVN)      # for outlier detection

Data

Read and inspect data:

# Read the data 
data <- read.csv("streaming_data.csv")
head(data)

##    date gender age social_metric time_since_signup demographic group
## 1 01/07      F  28             5              19.3           1     A
## 2 01/07      F  32             7              11.5           1     A
## 3 01/07      F  39             4               4.3           3     A
## 4 01/07      M  52            10               9.5           4     A
## 5 01/07      M  25             1              19.5           2     A
## 6 01/07      M  51             0              22.6           4     A
##   hours_watched
## 1          4.08
## 2          2.99
## 3          5.74
## 4          4.13
## 5          4.68
## 6          3.40

class(data$date)

## [1] "character"

class(data$gender)

## [1] "character"

class(data$age)

## [1] "integer"

class(data$social_metric)

## [1] "integer"

class(data$time_since_signup)

## [1] "numeric"

class(data$demographic)

## [1] "integer"

class(data$group)

## [1] "character"

class(data$hours_watched)

## [1] "numeric"

Step 1 - Correlation

Examine correlation between “hours_watched” and other variables in the dataset:

# Hours_watched vs date plot - gender specified
df <- data %>% separate(col = date, into = c("day", "month"), sep = "/")
is.data.frame(df)

## [1] TRUE

gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$day,
                          y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Date", y = "Hours watched", color = "Gender")
gg

df$day %<>% as.numeric()
df$gender %<>% as.factor()
df$group %<>% as.factor()

# Hours_watched vs date correlation calculation
x <- as.numeric(df$day)
y <- as.numeric(df$hours_watched)

# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')

# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)

# Hours_watched vs gender plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$gender,
                          y = df$hours_watched))
gg <- gg + labs(x = "Gender", y = "Hours watched")
gg

# Hours_watched vs age plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$age,
                          y = df$hours_watched))
gg <- gg + labs(x = "Age", y = "Hours watched")
gg

# Hours_watched vs age plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$age,
                          y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Age", y = "Hours watched", color = "Gender")
gg

# Hours_watched vs age correlation calculation
x <- df$age
y <- df$hours_watched

# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')

# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)

# Hours_watched vs social_metric
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$social_metric,
                          y = df$hours_watched))
gg <- gg + labs(x = "Social metric", y = "Hours watched")
gg

# Hours_watched vs social_metric - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$social_metric,
                          y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Social metric", y = "Hours watched", color = "Gender")
gg

# Hours_watched vs social_metric - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$social_metric,
                          y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Social metric", y = "Hours watched", size = "Age")
gg

# Hours_watched vs social metric correlation calculation
x <- df$social_metric
y <- df$hours_watched

# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')

# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)

# Hours_watched vs time_since_signup plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$time_since_signup,
                          y = df$hours_watched))
gg <- gg + labs(x = "Time since signup", y = "Hours watched")
gg

# Hours_watched vs time_since_signup plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$time_since_signup,
                          y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Time since signup", y = "Hours watched", color = "Gender")
gg

# Hours_watched vs time_since_signup plot - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$time_since_signup,
                          y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Time since signup", y = "Hours watched", size = "Age")
gg

x <- df$time_since_signup
y <- df$hours_watched

# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')

# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)

# Hours_watched vs demographic plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$demographic,
                          y = df$hours_watched))
gg <- gg + labs(x = "Demographic", y = "Hours watched")
gg

# Hours_watched vs demographic plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$demographic,
                          y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Demographic", y = "Hours watched", color = "Gender")
gg

# Hours_watched vs demographic plot - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df$demographic,
                          y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Demographic", y = "Hours watched", size = "Age")
gg

x <- df$demographic
y <- df$hours_watched

# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')

# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)

Outlier detection and exclusion

The MVN method is used for the detection and removal outliers as they significantly impact the accuracy of the regression model:

# Create a subset of the complete dataset
df_mod <- df %>% select(age, social_metric, hours_watched)

# Apply MVN method to identify and separate outliers
result <- df_mod %>%            
  MVN::mvn(multivariateOutlierMethod = "quan",          
           showOutliers = TRUE,             
           showNewData = TRUE)

# Create a data frame from outliers and inspect
df_mod_outliers <- result$multivariateOutliers
dim(df_mod_outliers)

## [1] 26  3

# Create a data frame free from outliers and inspect
df_clean <- result$newData
dim(df_clean)

## [1] 974   3

# Visualise the impact of outlier removal: hours_watched vs age plot
gg <- ggplot()
gg <- gg + geom_point(aes(x = df_clean$age,
                          y = df_clean$hours_watched))
gg <- gg + labs(x = "Age", y = "Hours watched")
gg

# Visualise the impact of outlier removal: hours_watched vs social_metric plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
                                                 height = 0.45), 
                      aes(x = df_clean$social_metric,
                          y = df_clean$hours_watched))
gg <- gg + labs(x = "Social metric", y = "Hours watched")
gg

Step 2 - Regression Model

# All variables included as predictors for multiple regression
reg_model0 <- lm(hours_watched ~ day + gender + age + social_metric + time_since_signup , data = df) 

summary(reg_model0)

## 
## Call:
## lm(formula = hours_watched ~ day + gender + age + social_metric + 
##     time_since_signup, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6874 -0.6423 -0.0177  0.7228  2.7709 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.265773   0.157325  39.827  < 2e-16 ***
## day                0.010651   0.003717   2.865  0.00425 ** 
## genderM            0.058156   0.067187   0.866  0.38693    
## age               -0.070746   0.003114 -22.720  < 2e-16 ***
## social_metric      0.096380   0.011116   8.670  < 2e-16 ***
## time_since_signup  0.002629   0.004602   0.571  0.56797    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.051 on 994 degrees of freedom
## Multiple R-squared:  0.3819, Adjusted R-squared:  0.3788 
## F-statistic: 122.8 on 5 and 994 DF,  p-value: < 2.2e-16

# Variables "gender, "age", "social_metric" and "time-since-signup" included as predictors for multiple regression
reg_model1 <- lm(hours_watched ~ gender + age + social_metric + time_since_signup , data = df) 

summary(reg_model1)

## 
## Call:
## lm(formula = hours_watched ~ gender + age + social_metric + time_since_signup, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7065 -0.6631 -0.0282  0.7477  2.8066 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.440849   0.145499  44.267   <2e-16 ***
## genderM            0.059767   0.067428   0.886    0.376    
## age               -0.070865   0.003125 -22.678   <2e-16 ***
## social_metric      0.097112   0.011154   8.707   <2e-16 ***
## time_since_signup  0.002237   0.004617   0.484    0.628    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 995 degrees of freedom
## Multiple R-squared:  0.3768, Adjusted R-squared:  0.3743 
## F-statistic: 150.4 on 4 and 995 DF,  p-value: < 2.2e-16

# Variables "age" and "social_metric" included as predictors for multiple regression
reg_model2 <- lm(hours_watched ~ age + social_metric , data = df_clean) 

summary(reg_model2)

## 
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40338 -0.65653 -0.02291  0.69715  2.39167 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.500750   0.124951  52.026   <2e-16 ***
## age           -0.070474   0.002957 -23.829   <2e-16 ***
## social_metric  0.099106   0.010528   9.413   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9799 on 971 degrees of freedom
## Multiple R-squared:  0.4071, Adjusted R-squared:  0.4058 
## F-statistic: 333.3 on 2 and 971 DF,  p-value: < 2.2e-16

# Calculate coefficients of multiple regression model
a0 <- coef(reg_model2)[1]
a1 <- coef(reg_model2)[2]
a2 <- coef(reg_model2)[3]

Residuals

Residuals to the straight line fit are computed:

# Fit to each value of x (mathematical representation of the model)
df_clean$f1 <- a0 + a1 * df_clean$age + a2 * df_clean$social_metric

# Calculate the residual
df_clean$e1 <- df_clean$hours_watched - df_clean$f1

# Plot normal Q-Q for residuals
qqnorm(df_clean$e1)
qqline(df_clean$e1, col = 2)

Plot histogram to ensure the normal distribution of the residuals:

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = df_clean$e1), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residuals Plot", x = "Residuals") 
gg

Step 3 - Sampling Bias Investigation

Check group balances: As we didn’t setup these groups ourselves it is always best to double check that they represent the total population.

# Count the numbers in each demographic based on the A/B group
check_a_df <- df %>% 
  filter(group == 'A') %>% 
  select(gender, demographic) %>% 
  group_by(gender, demographic) %>%
  arrange(demographic) %>% 
  mutate(n_a = n()) %>% 
  distinct()

check_b_df <- df %>% 
  filter(group == 'B') %>% 
  select(gender, demographic) %>% 
  group_by(gender, demographic) %>%
  arrange(demographic) %>%
  mutate(n_b = n()) %>% 
  distinct()

# Total numbers in each group
n_total_a <- sum(df$group == 'A')
n_total_b <- sum(df$group == 'B')

# Proportions in each demographic
check_a_df$p_a <- check_a_df$n_a / n_total_a
check_b_df$p_b <- check_b_df$n_b / n_total_b

# Join the results
check_df <- inner_join(check_a_df, check_b_df)

## Joining, by = c("gender", "demographic")

# Calculate the difference in proportions
check_df$diff <- check_df$p_a - check_df$p_b
check_df

## # A tibble: 4 × 7
## # Groups:   gender, demographic [4]
##   gender demographic   n_a   p_a   n_b   p_b     diff
##   <fct>        <int> <int> <dbl> <int> <dbl>    <dbl>
## 1 F                1   203 0.231    13 0.108  0.122  
## 2 M                2   236 0.268    32 0.267  0.00152
## 3 F                3   197 0.224    16 0.133  0.0905 
## 4 M                4   244 0.277    59 0.492 -0.214

# Theoretically if there is no bias aside from sampling noise then the difference should be small and normally distributed
qqnorm(y = check_df$diff)
qqline(check_df$diff, col = 2)

An alternative for checking group balances

# Inspect group statistics
print(check_df)

## # A tibble: 4 × 7
## # Groups:   gender, demographic [4]
##   gender demographic   n_a   p_a   n_b   p_b     diff
##   <fct>        <int> <int> <dbl> <int> <dbl>    <dbl>
## 1 F                1   203 0.231    13 0.108  0.122  
## 2 M                2   236 0.268    32 0.267  0.00152
## 3 F                3   197 0.224    16 0.133  0.0905 
## 4 M                4   244 0.277    59 0.492 -0.214

# Check with visualisations
check_df$ID <- seq.int(nrow(check_df))
check_df

## # A tibble: 4 × 8
## # Groups:   gender, demographic [4]
##   gender demographic   n_a   p_a   n_b   p_b     diff    ID
##   <fct>        <int> <int> <dbl> <int> <dbl>    <dbl> <int>
## 1 F                1   203 0.231    13 0.108  0.122       1
## 2 M                2   236 0.268    32 0.267  0.00152     2
## 3 F                3   197 0.224    16 0.133  0.0905      3
## 4 M                4   244 0.277    59 0.492 -0.214       4

plot_df <- check_df %>% pivot_longer(c(p_a, p_b), "group", "probability")
plot_df

## # A tibble: 8 × 8
## # Groups:   gender, demographic [4]
##   gender demographic   n_a   n_b     diff    ID group value
##   <fct>        <int> <int> <int>    <dbl> <int> <chr> <dbl>
## 1 F                1   203    13  0.122       1 p_a   0.231
## 2 F                1   203    13  0.122       1 p_b   0.108
## 3 M                2   236    32  0.00152     2 p_a   0.268
## 4 M                2   236    32  0.00152     2 p_b   0.267
## 5 F                3   197    16  0.0905      3 p_a   0.224
## 6 F                3   197    16  0.0905      3 p_b   0.133
## 7 M                4   244    59 -0.214       4 p_a   0.277
## 8 M                4   244    59 -0.214       4 p_b   0.492

ggplot() + geom_col(aes(x=plot_df$ID, y=plot_df$value, fill=plot_df$group), position = "dodge") +  
  scale_fill_manual(values=c("#F87660", "#0038BA")) +
  labs(title = "Group Balances", x = "Demographic", y = "Value", fill = "Group")

From both the Q-Q plot and the Group Balances chart above, it can be concluded that there is considerable bias in the sampling that “Why Not Watch?” has implemented. This would mean that no hypothesis testing is applicable to the whole dataset as the data on hand is not an accurate representation of the population that we are seeking insights for.

Step 4 - Examine the effect size

Since the sample dataset is biased we cannot examine the effect on the entirety of the data. However, hypothesis testing can be conducted within each demographic group.

# Construct demo1 subset and inspect
demo1 <- df %>% filter(demographic == 1) %>% group_by(group) %>% 
  select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo1)

## # A tibble: 2 × 2
##   group  mean
##   <fct> <dbl>
## 1 A      5.02
## 2 B      5.75

# Construct demo2 subset and inspect
demo2 <- df %>% filter(demographic == 2) %>% group_by(group) %>% 
  select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo2)

## # A tibble: 2 × 2
##   group  mean
##   <fct> <dbl>
## 1 A      5.01
## 2 B      5.71

# Construct demo3 subset and inspect
demo3 <- df %>% filter(demographic == 3) %>% group_by(group) %>% 
  select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo3)

## # A tibble: 2 × 2
##   group  mean
##   <fct> <dbl>
## 1 A      3.58
## 2 B      4.22

# Construct demo4 subset and inspect 
demo4 <- df %>% filter(demographic == 4) %>% group_by(group) %>% 
  select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo4)

## # A tibble: 2 × 2
##   group  mean
##   <fct> <dbl>
## 1 A      3.72
## 2 B      4.28

Effect visualisation

# Combine subsets
demo1$demographic <- c(1,1)
demo2$demographic <- c(2,2)
demo3$demographic <- c(3,3)
demo4$demographic <- c(4,4)
demo_tot <- bind_rows(demo1, demo2, demo3, demo4)

ggplot() + geom_col(aes(x = demo_tot$demographic, y = demo_tot$mean, fill = demo_tot$group), position = "dodge") +
  scale_fill_manual(values=c("#F87660", "#0038BA")) +
  labs(title = "Effect visualisation", x = "Demographic", y = "Mean hours watched", fill = "Group")

Conclusion

From this AB test we can conclude that even though sampling was biased, diving deeper into demographic categories within the data reveals that WNW’s new recommendation engine algorithm has a positive impact in terms of average hours watched.

References

Neversaint 2013, How to change legend title in ggplot, Stack Overflow, viewed 25 February 2022, https://stackoverflow.com/questions/14622421/how-to-change-legend-title-in-ggplot.

Recommender System Analysis

Applied Analytics

Mohammad Younesi