library(ggplot2)    # for data visualisation 
library(dplyr)      # for data wrangling 
library(readr)      # for reading data 
library(here)       # for file paths 
library(magrittr)   # for pipes 
#library("reshape2") # Load reshape2
library("psych")    # for scatter matrix

Load Streaming data

stream <- "streaming_data.csv"

#col_names <- c('gender', 'age', 'social_metric', 'time_since_signup', #'demographic', 'group', 'hours_watched')

stream_df <- read.csv(here(stream), header = TRUE)

stream_df$gender <- gsub("F", "0", stream_df$gender) #change gender F to 0

stream_df$gender <- gsub("M", "1", stream_df$gender) #change gender M to 1
#streamM <- stream_df$gender

stream_df$gender <- as.numeric(stream_df$gender) #convert gender from character to numeric

streamF <- filter(stream_df, gender == 0) # select only females
streamM <- filter(stream_df, gender == 1) # select only males

summary(stream_df)      # Inspect the dataset for possible errors
sum(is.na(stream_df))   # Check the dataset to make sure that there are no missing values
mean(is.na(stream_df))  # Check the dataset to make sure that there are no missing values

# From the above checks, it is confirmed that the dataset is clean.
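
# As a side note, the gender recoding and missing-value checks above could also be written as one
# dplyr pipeline. A minimal sketch (the object name stream_alt is hypothetical; it assumes the same
# column names as streaming_data.csv):

stream_alt <- read.csv(here(stream), header = TRUE) %>%
  mutate(gender = if_else(gender == "F", 0, 1)) # recode F/M to 0/1 in one step

colSums(is.na(stream_alt)) # per-column count of missing values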

A1 <- filter(stream_df, group == "A") # Select group A from start till end
A <- subset(A1, select = -c(date, group)) # Exclude date and group

Scatter matrix and correlation

pairs.panels(A,              # scatter matrix of all numeric columns in group A
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = TRUE,  # show density plots
             ellipses = TRUE # show correlation ellipses
             )

From the scatter matrix and the correlation coefficients, the correlation between age and demographic is 0.76, so only one of the two will be included in the multiple regression (MR) model to reduce multicollinearity. Since hours_watched (hrw) is the dependent variable and its correlation with age (-0.59) is stronger than its correlation with demographic (-0.45), age is chosen as the preferred independent variable, together with social_metric. The other factors are not included because of their low correlation (less than 0.03) with hrw.
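
As a quick numerical cross-check of the correlations quoted above (run before the residual columns e1 and e2 are added to A later on):

round(cor(A), 2) # correlation matrix of the numeric columns in group A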

It is also observed that only the hours_watched displays a normal distribution.

 
# MLR of all A
 

MLRA <- lm(hours_watched ~ age + social_metric, data = A) 

summary(MLRA)

# The ANOVA for MLRA shows that the p-values for age and social_metric are significant at less than 1%, so both variables are retained in the multiple regression model.

# The intercept of 6.54 suggests a predicted hours_watched of about 6.54 hours when the independent variables are zero. This baseline could reflect factors not captured by the MR, e.g. during the COVID lockdown people generally spent more time streaming because they could not go to work or take part in other activities.

# The beta coefficient for age is -0.07, consistent with the scatter plot above, where a weak negative relationship between hours_watched and age is also observed. This means that older people do not stream as much as younger people.

# The beta coefficient for social_metric is 0.085. The relationship between hours_watched and social_metric is not very clear in the scatter plot because the data are spread widely; this can be explained by the nature of the variable, which takes discrete values between 0 and 10. However, the weak positive relationship is significant and confirmed in the MR.
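
# The coefficient estimates discussed above, together with their 95% confidence intervals, can be
# extracted directly from the fitted model; a small cross-check using only base R:

round(coef(MLRA), 3)                  # intercept and slopes of the MR model
round(confint(MLRA, level = 0.95), 3) # 95% confidence intervals for the coefficients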

Checking the assumptions of the linear model: hours_watched vs age

# 1. Linearity - The scatter plot shows a linear relationship between hrw and age, satisfying the linearity assumption.

# 2.  Constant variance

# Create a linear model hrw vs age

y <- A$hours_watched # Select hours_watched as y
x <- A$age           # Select age as x

fit1 <- lm(y ~ x)

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]

Plot the fit over the data

# setup x variable with range of interest

a0 <- coef(fit1)[1]
a1 <- coef(fit1)[2]

xfit1 <- seq(min(x), max(x), 1)

# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- a0 + a1 * xfit1

Plot hours_watched vs age

gg <- ggplot()
gg <- gg + geom_point(aes(x = x, y = y, colour ='hours_watched'))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'black')
gg <- gg + labs(x = 'age', y = 'hours_watched')
gg

# The hours_watched vs age graph of the actual observations and the fitted line (fit1) shows that the errors are evenly distributed around the fitted line, so the assumption of constant variance is satisfied.
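
# Base R's built-in lm diagnostics provide a quick cross-check of the constant-variance assumption;
# a minimal sketch using the fit1 model above:

plot(fit1, which = 1) # residuals vs fitted values: look for an even band around zero
plot(fit1, which = 3) # scale-location plot: a roughly flat trend supports constant variance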

Check the normal distribution of the error term

# calculate the fitted values

Af1 <- a0 + a1 * A$age  # Af1 is the forecast hours_watched based on the original age observations

# calculate the residual
A$e1 <- A$hours_watched - Af1

gg <- ggplot()
gg <- gg + geom_point(aes(x = x, y = A$e1))
gg <- gg + labs(x = 'age', y = 'residual')
gg

qqnorm(A$e1, 
        main = 'QQ Plot for residual1 of hours_watched (Actual minus Expected)', 
        xlab = 'Theoretical Quantiles', ylab = 'Residual1', col="violet")
qqline(A$e1)

# The QQ plot of the residuals of hours_watched (Actual minus Expected) shows that the residuals are normally distributed, as assumed.
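
# As an additional check on the visual assessment, a formal normality test can be applied to the
# residuals; shapiro.test() is in base R (stats) and works for samples of up to 5000 observations:

shapiro.test(A$e1) # null hypothesis: the residuals are normally distributed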

Check the histogram of the residuals

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = A$e1), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residual1 of hours_watched (Actual minus Expected)", x = "Residual1") 
gg

# The histogram confirms that the residual of hours_watched (Actual minus Expected) is normally distributed.

# The independence assumption is addressed: the high correlation (0.76) between age and demographic was recognised in the scatter matrix, only age was retained in the MR, and the multicollinearity issue is reduced.
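
# If the car package is installed, the variance inflation factors give a numerical check that
# multicollinearity in the MR model is low after dropping demographic (a sketch, assuming car is available):

library(car) # assumption: the car package is installed
vif(MLRA)    # VIF values close to 1 indicate little multicollinearity between age and social_metric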

Checking the assumptions for hours_watched vs social_metric

# 1. Linearity - The scatter plot shows a linear relationship between hrw and social_metric, satisfying the linearity assumption.


# 2.  Constant variance

# Create a linear model hrw vs social_metric

y2 <- A$hours_watched  # Select hours_watched as y2
x2 <- A$social_metric  # Select social_metric as x2

fit2 <- lm(y2 ~ x2)

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]

summary(fit2)

Plot the fit over the data (fit2)

# setup x variable with range of interest

xfit2 <- seq(min(x2), max(x2), 1)

b0 <- coef(fit2)[1]
b1 <- coef(fit2)[2]

# calculate the fitting function yfit based on the coefficients from the model
yfit2 <- b0 + b1 * xfit2
gg <- ggplot()
gg <- gg + geom_point(aes(x = x2, y = y2, colour='hours_watched'))
gg <- gg + geom_line(aes(x = xfit2, y = yfit2), colour = 'black')
gg <- gg + labs(x = 'social_metric', y = 'hours_watched')
gg

# The hours_watched vs social_metric graph of the actual observations and the fitted line (fit2) shows that the errors are evenly distributed around the fitted line, so the assumption of constant variance is satisfied.
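
# If the lmtest package is installed, the Breusch-Pagan test offers a formal check of the
# constant-variance assumption for fit2 (a sketch; a large p-value supports constant variance):

library(lmtest) # assumption: the lmtest package is installed
bptest(fit2)    # null hypothesis: the error variance is constant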

Check the normal distribution of the error term

# calculate the fitted values

Af2 <- b0 + b1 * A$social_metric  # Af2 is the forecast hours_watched based on the original social_metric observations

# calculate the residual
A$e2 <- A$hours_watched - Af2

gg <- ggplot()
gg <- gg + geom_point(aes(x = x2, y = A$e2))
gg <- gg + labs(x = 'social_metric', y = 'residual')
gg

qqnorm(A$e2, 
        main = 'QQ Plot for residual2 of hours_watched (Actual minus Expected)', 
        xlab = 'Theoretical Quantiles', ylab = 'Residual', col="violet")
qqline(A$e2)

# The QQ plot of the residuals of hours_watched (Actual minus Expected) shows that the residuals are normally distributed, as assumed.

Check the histogram of the residuals

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = A$e2), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residual2 of hours_watched (Actual minus Expected)", x = "Residual2") 
gg

# The histogram confirms that the residual of hours_watched (Actual minus Expected) is normally distributed.

# The independence assumption is addressed: the high correlation (0.76) between age and demographic was recognised in the scatter matrix, only age was retained in the MR, and the multicollinearity issue is reduced.

Check for Bias

# Find the mean and sd of hours_watched for the whole population (the whole dataset)

Pophrwmean <- mean(stream_df$hours_watched)
Pophrwsd <- sd(stream_df$hours_watched)

# Find the margin of error of hrw between A and B from 18/7

M1 <- stream_df[1:548,] #Select rows from start till 17 July
post177 <- stream_df[549:1000,] #Select rows from 18 July till end

M2 <- filter(post177, group == "A") # Select group A from 18 July till end

M3 <- filter(post177, group == "B") # Select group B from 18 July till end

M1hrw <- M1$hours_watched

M2hrw <- M2$hours_watched

M3hrw <- M3$hours_watched

Margin <- mean(M3$hours_watched) - mean(M2$hours_watched) # Margin of error based on A and B after 17/7.

Minimum Sample size

z_alpha <- 1.96 #95% confidence level
effect_est <- Margin   #Margin of error is 0.41
sd_est <- Pophrwsd # Population sd is 1.33
n_ss <- ceiling((z_alpha * sd_est / effect_est)^2)
print(paste('Min sample size', n_ss))

# minimum sample size is 41
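
# The same sample-size formula can be wrapped in a small helper so other margins can be tried;
# a minimal sketch (the function name min_n is hypothetical):

min_n <- function(z, sd, margin) ceiling((z * sd / margin)^2)
min_n(z_alpha, sd_est, effect_est) # reproduces the value of n_ss above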
## The dataset was checked previously and it was confirmed that it is clean and ready for use.

## Check the streaming behaviour between older people and younger people for the whole dataset

Streaming behaviour of old vs young

stream_df <- read.csv(here(stream), header = TRUE)

p1<- ggplot(stream_df, aes(x=age, y=hours_watched)) +
  geom_point(size=2, shape=23)

p1

# The plot shows that older people have lower hours_watched.

Check for differences in the streaming behaviour between females and males

p2<- ggplot(stream_df, aes(x=gender, y=hours_watched)) +
  geom_point(size=2, shape=23)

p2

Fmeanhrw <- mean(streamF$hours_watched)  #Average hrw of females (0)
Mmeanhrw <- mean(streamM$hours_watched)  #Average hrw of males (1)

# Average hrw watched by females is 4.35 hours
# Average hrw watched by males is 4.42 hours
# On average, males spend slightly more time (about 4.2 min) than females, and the plot shows that more males than females stream between 7 and 8 hours.
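
# Whether the roughly 4-minute gap between males and females is statistically meaningful can be
# checked with a Welch two-sample t-test on the two subsets created earlier (a sketch, not a formal A/B test):

t.test(streamF$hours_watched, streamM$hours_watched, alternative = "two.sided", var.equal = FALSE)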

Check age distribution

stream_df2 <- read.csv(here(stream), header = TRUE)

p4<-ggplot(data=stream_df2, aes(x=age, y=hours_watched)) +
  geom_bar(stat="identity")
p4

# The total hours_watched for ages 20, 23 and 28 exceeds 175 hours. 
# Age would be a logical divider in terms of demographic groups of interest in later analysis.

# Different marketing strategies would be needed based on age and gender. 
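
# A grouped summary makes the age/gender differences discussed above easier to compare; a sketch
# (the age bands below are illustrative, not taken from the analysis):

stream_df2 %>%
  mutate(age_band = cut(age, breaks = c(0, 30, 45, Inf),
                        labels = c("30 and under", "31-45", "46+"))) %>%
  group_by(gender, age_band) %>%
  summarise(mean_hrw = round(mean(hours_watched), 2),
            n = n(), .groups = "drop")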

gg <- ggplot(stream_df2, aes(x = age, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

cond <- stream_df2$gender == 'M'
sorted_M <- stream_df2[cond,] %>% arrange(age)
sorted_M$number <- 1
sorted_M$cumsum <- cumsum(sorted_M$number)

cond <- stream_df2$gender == 'F'
sorted_F <- stream_df2[cond,] %>% arrange(age)
sorted_F$number <- 1
sorted_F$cumsum <- cumsum(sorted_F$number)

Clusters

int_breaks_rounded <- function(x, n = 10) pretty(x, n)[round(pretty(x, n),1) %% 1 == 0]

gg <- ggplot()
gg <- gg + geom_point(aes(x = stream_df2$age,
                          y = stream_df2$hours_watched))

gg <- gg + scale_y_continuous(breaks = int_breaks_rounded )

gg <- gg + labs(x = 'Age (years)', y = 'hours_watched')
gg

# The plot shows that for ages below 35 many of the hours_watched values exceed 7 hours.

# A clear split in the data occurs at hours_watched of about 7.

# A midway age split for the cluster above the 7-hour line is about 28, while for the lower cluster it is about 45.
age_cut_above <- 28
age_cut_below <- 45

# group above/below the line
stream_df2$above <- stream_df2$hours_watched > 7

# create young/old groups with different splits depending on the above/below group
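
# A minimal sketch of that young/old labelling, using the age_cut_above/age_cut_below values and the
# above flag created just before (the column name age_group is an assumption):

stream_df2$age_group <- ifelse(stream_df2$above,
                               ifelse(stream_df2$age <= age_cut_above, "young", "old"),
                               ifelse(stream_df2$age <= age_cut_below, "young", "old"))

table(stream_df2$above, stream_df2$age_group) # counts per cluster and age group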

# required percentage to be set aside for the control group
# summary_df$pc_needed <- n_ss / summary_df$n 

# max required percentage
#pc_needed <- max(summary_df$pc_needed)
#print(paste('pc_needed:', pc_needed))
# count the numbers in each demographic category based on the A/B group created by WNW.

M2A <- filter(post177, group == "A") # Select group A from 18 July till end (same rows as M2)
M2B <- filter(post177, group == "B") # Select group B from 18 July till end (same rows as M3)

# Calculate percentage (pc) of the demographic (demo) distribution in A

# separate demographic into 1 to 4

AM2demo1 <- filter(M2A, M2A$demographic == 1) # contains demographic = 1
AM2demo2 <- filter(M2A, M2A$demographic == 2) # contains demographic = 2
AM2demo3 <- filter(M2A, M2A$demographic == 3) # contains demographic = 3
AM2demo4 <- filter(M2A, M2A$demographic == 4) # contains demographic = 4
AM2demo  <- filter(M2A, M2A$demographic <= 5) # contains demographic = 1 to 4

Ademo1Count <- AM2demo1%>% count(demographic) # Count number of demographic = 1
Ademo2Count <- AM2demo2%>% count(demographic) # Count number of demographic = 2
Ademo3Count <- AM2demo3%>% count(demographic) # Count number of demographic = 3
Ademo4Count <- AM2demo4%>% count(demographic) # Count number of demographic = 4
AdemoCount <- AM2demo%>% count(demographic) # Count number of all demographic in A

Ademo1C <- sum(Ademo1Count[,2]) # Extract number of demo = 1
Ademo2C <- sum(Ademo2Count[,2]) # Extract number of demo = 2
Ademo3C <- sum(Ademo3Count[,2]) # Extract number of demo = 3
Ademo4C <- sum(Ademo4Count[,2]) # Extract number of demo = 4
AdemoC <- sum(AdemoCount[,2])   # Extract number of all demographic in A

ATot_demo <- AdemoC

Ademo1 <- round((Ademo1C/ATot_demo) * 100, digits= 2)
Ademo2 <- round((Ademo2C/ATot_demo) * 100, digits= 2)
Ademo3 <- round((Ademo3C/ATot_demo) * 100, digits= 2)
Ademo4 <- round((Ademo4C/ATot_demo) * 100, digits= 2)

print(paste0("Ademographic 1 % = ", Ademo1, "  Ademographic 2 % = ", Ademo2, "  Ademographic 3 % = ", Ademo3, "  Ademographic 4 % = ", Ademo4))
# Calculate percentage (pc) of demographic (demo) distribution in B

# separate demographic into 1 to 4

BM2demo1 <- filter(M2B, M2B$demographic == 1) # contains demographic = 1
BM2demo2 <- filter(M2B, M2B$demographic == 2) # contains demographic = 2
BM2demo3 <- filter(M2B, M2B$demographic == 3) # contains demographic = 3
BM2demo4 <- filter(M2B, M2B$demographic == 4) # contains demographic = 4
BM2demo  <- filter(M2B, M2B$demographic <= 5) # contains demographic = 1 to 4

Bdemo1Count <- BM2demo1%>% count(demographic) # Count number of demographic = 1
Bdemo2Count <- BM2demo2%>% count(demographic) # Count number of demographic = 2
Bdemo3Count <- BM2demo3%>% count(demographic) # Count number of demographic = 3
Bdemo4Count <- BM2demo4%>% count(demographic) # Count number of demographic = 4
BdemoCount <- BM2demo%>% count(demographic) # Count number of all demographic in B

Bdemo1C <- sum(Bdemo1Count[,2]) # Extract number of demo = 1
Bdemo2C <- sum(Bdemo2Count[,2]) # Extract number of demo = 2
Bdemo3C <- sum(Bdemo3Count[,2]) # Extract number of demo = 3
Bdemo4C <- sum(Bdemo4Count[,2]) # Extract number of demo = 4
BdemoC <- sum(BdemoCount[,2]) # Extract number of all demographic in B

BTot_demo <- BdemoC

Bdemo1 <- round((Bdemo1C/BTot_demo) * 100, digits= 2)
Bdemo2 <- round((Bdemo2C/BTot_demo) * 100, digits= 2)
Bdemo3 <- round((Bdemo3C/BTot_demo) * 100, digits= 2)
Bdemo4 <- round((Bdemo4C/BTot_demo) * 100, digits= 2)

print(paste0("Bdemographic 1 % = ", Bdemo1, "  Bdemographic 2 % = ", Bdemo2, "  Bdemographic 3 % = ", Bdemo3, "  Bdemographic 4 % = ", Bdemo4))
## Check group balances

# WNW set up the A/B groups; double-check that the A/B groups represent the total population.

The hypothesis tests will be done on each subgroup, as groups A and B are biased samples that do not represent the whole population.

# Check the sd of each subgroup to see whether the variances are equal or unequal between A and B. If unequal, use the Welch test.

demo1diff <- sd(AM2demo1$hours_watched) - sd(BM2demo1$hours_watched)
demo2diff <- sd(AM2demo2$hours_watched) - sd(BM2demo2$hours_watched)
demo3diff <- sd(AM2demo3$hours_watched) - sd(BM2demo3$hours_watched)
demo4diff <- sd(AM2demo4$hours_watched) - sd(BM2demo4$hours_watched)

demo1diff
demo2diff
demo3diff
demo4diff
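
# An F test of equal variances gives a formal basis for the equal/unequal-variance decision above;
# a sketch for the first demographic subgroup (the same call applies to demo 2 to 4):

var.test(AM2demo1$hours_watched, BM2demo1$hours_watched) # null hypothesis: equal variances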

#Null Hypothesis: The means of A and B are the same
#Alternative Hypothesis: The means of A and B are not the same

t.test(AM2demo1$hours_watched, BM2demo1$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 1

t.test(AM2demo2$hours_watched, BM2demo2$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 2

t.test(AM2demo3$hours_watched, BM2demo3$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 3

t.test(AM2demo4$hours_watched, BM2demo4$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 4
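
# The four Welch tests above can also be run in a loop to collect the p-values in one vector;
# a minimal sketch over the same M2A/M2B data frames:

sapply(1:4, function(d) {
  t.test(filter(M2A, demographic == d)$hours_watched,
         filter(M2B, demographic == d)$hours_watched,
         alternative = "two.sided", var.equal = FALSE)$p.value
})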

Effect = difference in mean hours_watched from 18/7

# (new names are used here so the demographic percentage variables Ademo1..Ademo4 / Bdemo1..Bdemo4 above are not overwritten)
Ademo1mean <- mean(AM2demo1$hours_watched)
Ademo2mean <- mean(AM2demo2$hours_watched)
Ademo3mean <- mean(AM2demo3$hours_watched)
Ademo4mean <- mean(AM2demo4$hours_watched)

Bdemo1mean <- mean(BM2demo1$hours_watched)
Bdemo2mean <- mean(BM2demo2$hours_watched)
Bdemo3mean <- mean(BM2demo3$hours_watched)
Bdemo4mean <- mean(BM2demo4$hours_watched)

# Effect is defined as mean of hrw B - mean of hrw A

demo1E1 <- Bdemo1mean - Ademo1mean # Effect of demo 1
demo2E2 <- Bdemo2mean - Ademo2mean # Effect of demo 2
demo3E3 <- Bdemo3mean - Ademo3mean # Effect of demo 3
demo4E4 <- Bdemo4mean - Ademo4mean # Effect of demo 4

Name <- c("demo1", "demo2", "demo3", "demo4")
Effect <- c(demo1E1, demo2E2, demo3E3, demo4E4)

df <- data.frame(Name, Effect)

print(df)
# count the proportion of demographic group based on the complete dataset from 1/7 to 31/7.

check_w_df <- stream_df %>%
  select(demographic) %>%
  group_by(demographic) %>%
  mutate(n_w = n()) %>%
  distinct()

Wn_total <- nrow(stream_df) # total number of observations

# proportions in each demographic
check_w_df$p_w <- check_w_df$n_w / Wn_total

df2 <- arrange(check_w_df, demographic) #sort demographic in order

Effect <- c(demo1E1, demo2E2, demo3E3, demo4E4)
Prop <- df2$p_w

WAFinalE <- round(sum(Effect * Prop), digits=4) # Weighted Average final effect

print(paste('WA final effect of hours_watched is', WAFinalE))

# The weighted average of the final effect of hours_watched due to the new recommendation engine is estimated to be 0.5975 hours (about 36 min).
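
# Expressing the weighted-average effect in minutes supports the "about 36 min" figure above:
round(WAFinalE * 60, 1) # weighted-average effect in minutes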