# All the required libraries
library(ggplot2) # for data visualisation
library(magrittr) # for pipes
library(dplyr) # for data wrangling
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # for data wrangling
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(readr) # for reading data
library(MVN) # for outlier detection
Read and inspect data:
# Read the data
data <- read.csv("streaming_data.csv")
head(data)
## date gender age social_metric time_since_signup demographic group
## 1 01/07 F 28 5 19.3 1 A
## 2 01/07 F 32 7 11.5 1 A
## 3 01/07 F 39 4 4.3 3 A
## 4 01/07 M 52 10 9.5 4 A
## 5 01/07 M 25 1 19.5 2 A
## 6 01/07 M 51 0 22.6 4 A
## hours_watched
## 1 4.08
## 2 2.99
## 3 5.74
## 4 4.13
## 5 4.68
## 6 3.40
class(data$date)
## [1] "character"
class(data$gender)
## [1] "character"
class(data$age)
## [1] "integer"
class(data$social_metric)
## [1] "integer"
class(data$time_since_signup)
## [1] "numeric"
class(data$demographic)
## [1] "integer"
class(data$group)
## [1] "character"
class(data$hours_watched)
## [1] "numeric"
Examine correlation between “hours_watched” and other variables in the dataset:
# Hours_watched vs date plot - gender specified
df <- data %>% separate(col = date, into = c("day", "month"), sep = "/")
is.data.frame(df)
## [1] TRUE
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$day,
y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Date", y = "Hours watched", color = "Gender")
gg
df$day %<>% as.numeric()
df$gender %<>% as.factor()
df$group %<>% as.factor()
# Hours_watched vs date correlation calculation
x <- as.numeric(df$day)
y <- as.numeric(df$hours_watched)
# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')
# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)
# Hours_watched vs gender plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$gender,
y = df$hours_watched))
gg <- gg + labs(x = "Gender", y = "Hours watched")
gg
# Hours_watched vs age plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$age,
y = df$hours_watched))
gg <- gg + labs(x = "Age", y = "Hours watched")
gg
# Hours_watched vs age plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$age,
y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Age", y = "Hours watched", color = "Gender")
gg
# Hours_watched vs age correlation calculation
x <- df$age
y <- df$hours_watched
# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')
# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)
# Hours_watched vs social_metric
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$social_metric,
y = df$hours_watched))
gg <- gg + labs(x = "Social metric", y = "Hours watched")
gg
# Hours_watched vs social_metric - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$social_metric,
y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Social metric", y = "Hours watched", color = "Gender")
gg
# Hours_watched vs social_metric - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$social_metric,
y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Social metric", y = "Hours watched", size = "Age")
gg
# Hours_watched vs social metric correlation calculation
x <- df$social_metric
y <- df$hours_watched
# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')
# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)
# Hours_watched vs time_since_signup plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$time_since_signup,
y = df$hours_watched))
gg <- gg + labs(x = "Time since signup", y = "Hours watched")
gg
# Hours_watched vs time_since_signup plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$time_since_signup,
y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Time since signup", y = "Hours watched", color = "Gender")
gg
# Hours_watched vs time_since_signup plot - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$time_since_signup,
y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Time since signup", y = "Hours watched", size = "Age")
gg
x <- df$time_since_signup
y <- df$hours_watched
# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')
# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)
# Hours_watched vs demographic plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$demographic,
y = df$hours_watched))
gg <- gg + labs(x = "Demographic", y = "Hours watched")
gg
# Hours_watched vs demographic plot - gender specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$demographic,
y = df$hours_watched, color = df$gender))
gg <- gg + labs(x = "Demographic", y = "Hours watched", color = "Gender")
gg
# Hours_watched vs demographic plot - age specified
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df$demographic,
y = df$hours_watched, size = df$age))
gg <- gg + labs(x = "Demographic", y = "Hours watched", size = "Age")
gg
x <- df$demographic
y <- df$hours_watched
# Using built-in R function
r_coeff <- cor(x, y, method = 'pearson')
# Plot
title_text <- paste0("r_xy = ", round(r_coeff, 6))
plot(x, y, main = title_text)
The MVN method is used for the detection and removal outliers as they significantly impact the accuracy of the regression model:
# Create a subset of the complete dataset
df_mod <- df %>% select(age, social_metric, hours_watched)
# Apply MVN method to identify and separate outliers
result <- df_mod %>%
MVN::mvn(multivariateOutlierMethod = "quan",
showOutliers = TRUE,
showNewData = TRUE)
# Create a data frame from outliers and inspect
df_mod_outliers <- result$multivariateOutliers
dim(df_mod_outliers)
## [1] 26 3
# Create a data frame free from outliers and inspect
df_clean <- result$newData
dim(df_clean)
## [1] 974 3
# Visualise the impact of outlier removal: hours_watched vs age plot
gg <- ggplot()
gg <- gg + geom_point(aes(x = df_clean$age,
y = df_clean$hours_watched))
gg <- gg + labs(x = "Age", y = "Hours watched")
gg
# Visualise the impact of outlier removal: hours_watched vs social_metric plot
gg <- ggplot()
gg <- gg + geom_point(position = position_jitter(width = 0.45,
height = 0.45),
aes(x = df_clean$social_metric,
y = df_clean$hours_watched))
gg <- gg + labs(x = "Social metric", y = "Hours watched")
gg
# All variables included as predictors for multiple regression
reg_model0 <- lm(hours_watched ~ day + gender + age + social_metric + time_since_signup , data = df)
summary(reg_model0)
##
## Call:
## lm(formula = hours_watched ~ day + gender + age + social_metric +
## time_since_signup, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6874 -0.6423 -0.0177 0.7228 2.7709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.265773 0.157325 39.827 < 2e-16 ***
## day 0.010651 0.003717 2.865 0.00425 **
## genderM 0.058156 0.067187 0.866 0.38693
## age -0.070746 0.003114 -22.720 < 2e-16 ***
## social_metric 0.096380 0.011116 8.670 < 2e-16 ***
## time_since_signup 0.002629 0.004602 0.571 0.56797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.051 on 994 degrees of freedom
## Multiple R-squared: 0.3819, Adjusted R-squared: 0.3788
## F-statistic: 122.8 on 5 and 994 DF, p-value: < 2.2e-16
# Variables "gender, "age", "social_metric" and "time-since-signup" included as predictors for multiple regression
reg_model1 <- lm(hours_watched ~ gender + age + social_metric + time_since_signup , data = df)
summary(reg_model1)
##
## Call:
## lm(formula = hours_watched ~ gender + age + social_metric + time_since_signup,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7065 -0.6631 -0.0282 0.7477 2.8066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.440849 0.145499 44.267 <2e-16 ***
## genderM 0.059767 0.067428 0.886 0.376
## age -0.070865 0.003125 -22.678 <2e-16 ***
## social_metric 0.097112 0.011154 8.707 <2e-16 ***
## time_since_signup 0.002237 0.004617 0.484 0.628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 995 degrees of freedom
## Multiple R-squared: 0.3768, Adjusted R-squared: 0.3743
## F-statistic: 150.4 on 4 and 995 DF, p-value: < 2.2e-16
# Variables "age" and "social_metric" included as predictors for multiple regression
reg_model2 <- lm(hours_watched ~ age + social_metric , data = df_clean)
summary(reg_model2)
##
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40338 -0.65653 -0.02291 0.69715 2.39167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.500750 0.124951 52.026 <2e-16 ***
## age -0.070474 0.002957 -23.829 <2e-16 ***
## social_metric 0.099106 0.010528 9.413 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9799 on 971 degrees of freedom
## Multiple R-squared: 0.4071, Adjusted R-squared: 0.4058
## F-statistic: 333.3 on 2 and 971 DF, p-value: < 2.2e-16
# Calculate coefficients of multiple regression model
a0 <- coef(reg_model2)[1]
a1 <- coef(reg_model2)[2]
a2 <- coef(reg_model2)[3]
Residuals to the straight line fit are computed:
# Fit to each value of x (mathematical representation of the model)
df_clean$f1 <- a0 + a1 * df_clean$age + a2 * df_clean$social_metric
# Calculate the residual
df_clean$e1 <- df_clean$hours_watched - df_clean$f1
# Plot normal Q-Q for residuals
qqnorm(df_clean$e1)
qqline(df_clean$e1, col = 2)
Plot histogram to ensure the normal distribution of the residuals:
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = df_clean$e1), bins = 10, colour = "black")
gg <- gg + labs(title = "Residuals Plot", x = "Residuals")
gg
Check group balances: As we didn’t setup these groups ourselves it is always best to double check that they represent the total population.
# Count the numbers in each demographic based on the A/B group
check_a_df <- df %>%
filter(group == 'A') %>%
select(gender, demographic) %>%
group_by(gender, demographic) %>%
arrange(demographic) %>%
mutate(n_a = n()) %>%
distinct()
check_b_df <- df %>%
filter(group == 'B') %>%
select(gender, demographic) %>%
group_by(gender, demographic) %>%
arrange(demographic) %>%
mutate(n_b = n()) %>%
distinct()
# Total numbers in each group
n_total_a <- sum(df$group == 'A')
n_total_b <- sum(df$group == 'B')
# Proportions in each demographic
check_a_df$p_a <- check_a_df$n_a / n_total_a
check_b_df$p_b <- check_b_df$n_b / n_total_b
# Join the results
check_df <- inner_join(check_a_df, check_b_df)
## Joining, by = c("gender", "demographic")
# Calculate the difference in proportions
check_df$diff <- check_df$p_a - check_df$p_b
check_df
## # A tibble: 4 × 7
## # Groups: gender, demographic [4]
## gender demographic n_a p_a n_b p_b diff
## <fct> <int> <int> <dbl> <int> <dbl> <dbl>
## 1 F 1 203 0.231 13 0.108 0.122
## 2 M 2 236 0.268 32 0.267 0.00152
## 3 F 3 197 0.224 16 0.133 0.0905
## 4 M 4 244 0.277 59 0.492 -0.214
# Theoretically if there is no bias aside from sampling noise then the difference should be small and normally distributed
qqnorm(y = check_df$diff)
qqline(check_df$diff, col = 2)
# Inspect group statistics
print(check_df)
## # A tibble: 4 × 7
## # Groups: gender, demographic [4]
## gender demographic n_a p_a n_b p_b diff
## <fct> <int> <int> <dbl> <int> <dbl> <dbl>
## 1 F 1 203 0.231 13 0.108 0.122
## 2 M 2 236 0.268 32 0.267 0.00152
## 3 F 3 197 0.224 16 0.133 0.0905
## 4 M 4 244 0.277 59 0.492 -0.214
# Check with visualisations
check_df$ID <- seq.int(nrow(check_df))
check_df
## # A tibble: 4 × 8
## # Groups: gender, demographic [4]
## gender demographic n_a p_a n_b p_b diff ID
## <fct> <int> <int> <dbl> <int> <dbl> <dbl> <int>
## 1 F 1 203 0.231 13 0.108 0.122 1
## 2 M 2 236 0.268 32 0.267 0.00152 2
## 3 F 3 197 0.224 16 0.133 0.0905 3
## 4 M 4 244 0.277 59 0.492 -0.214 4
plot_df <- check_df %>% pivot_longer(c(p_a, p_b), "group", "probability")
plot_df
## # A tibble: 8 × 8
## # Groups: gender, demographic [4]
## gender demographic n_a n_b diff ID group value
## <fct> <int> <int> <int> <dbl> <int> <chr> <dbl>
## 1 F 1 203 13 0.122 1 p_a 0.231
## 2 F 1 203 13 0.122 1 p_b 0.108
## 3 M 2 236 32 0.00152 2 p_a 0.268
## 4 M 2 236 32 0.00152 2 p_b 0.267
## 5 F 3 197 16 0.0905 3 p_a 0.224
## 6 F 3 197 16 0.0905 3 p_b 0.133
## 7 M 4 244 59 -0.214 4 p_a 0.277
## 8 M 4 244 59 -0.214 4 p_b 0.492
ggplot() + geom_col(aes(x=plot_df$ID, y=plot_df$value, fill=plot_df$group), position = "dodge") +
scale_fill_manual(values=c("#F87660", "#0038BA")) +
labs(title = "Group Balances", x = "Demographic", y = "Value", fill = "Group")
From both the Q-Q plot and the Group Balances chart above, it can be
concluded that there is considerable bias in the sampling that “Why Not
Watch?” has implemented. This would mean that no hypothesis testing is
applicable to the whole dataset as the data on hand is not an accurate
representation of the population that we are seeking insights for.
Since the sample dataset is biased we cannot examine the effect on the entirety of the data. However, hypothesis testing can be conducted within each demographic group.
# Construct demo1 subset and inspect
demo1 <- df %>% filter(demographic == 1) %>% group_by(group) %>%
select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo1)
## # A tibble: 2 × 2
## group mean
## <fct> <dbl>
## 1 A 5.02
## 2 B 5.75
# Construct demo2 subset and inspect
demo2 <- df %>% filter(demographic == 2) %>% group_by(group) %>%
select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo2)
## # A tibble: 2 × 2
## group mean
## <fct> <dbl>
## 1 A 5.01
## 2 B 5.71
# Construct demo3 subset and inspect
demo3 <- df %>% filter(demographic == 3) %>% group_by(group) %>%
select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo3)
## # A tibble: 2 × 2
## group mean
## <fct> <dbl>
## 1 A 3.58
## 2 B 4.22
# Construct demo4 subset and inspect
demo4 <- df %>% filter(demographic == 4) %>% group_by(group) %>%
select(demographic, group, hours_watched) %>% summarise(mean = mean(hours_watched))
print(demo4)
## # A tibble: 2 × 2
## group mean
## <fct> <dbl>
## 1 A 3.72
## 2 B 4.28
# Combine subsets
demo1$demographic <- c(1,1)
demo2$demographic <- c(2,2)
demo3$demographic <- c(3,3)
demo4$demographic <- c(4,4)
demo_tot <- bind_rows(demo1, demo2, demo3, demo4)
ggplot() + geom_col(aes(x = demo_tot$demographic, y = demo_tot$mean, fill = demo_tot$group), position = "dodge") +
scale_fill_manual(values=c("#F87660", "#0038BA")) +
labs(title = "Effect visualisation", x = "Demographic", y = "Mean hours watched", fill = "Group")
From this AB test we can conclude that even though sampling was biased, diving deeper into demographic categories within the data reveals that WNW’s new recommendation engine algorithm has a positive impact in terms of average hours watched.
Neversaint 2013, How to change legend title in ggplot, Stack Overflow, viewed 25 February 2022, https://stackoverflow.com/questions/14622421/how-to-change-legend-title-in-ggplot.