library(ggplot2) # for data visualisation
library(dplyr) # for data wrangling
library(readr) # for reading data
library(here) # for file paths
library(magrittr) # for pipes
#library("ggpubr") # for creating and customizing 'ggplot2'- based #publication ready plots.
#theme_set(theme_pubr())
library("reshape2") # Load reshape2
#library("scatterplot3d") # load
#library("plot3D")
library("psych")
stream <- "streaming_data.csv"
```

The following chunk loads the pre-cleaned streaming data.

```{r load-stream-data}
# read the streaming data
col_names <- c('gender', 'age', 'social_metric', 'time_since_signup', 'demographic', 'group', 'hours_watched') # expected columns (for reference only; read.csv uses the file header)
stream_df <- read.csv(here(stream), header = TRUE)
stream_df$gender <- gsub("F", "0", stream_df$gender) #change gender F to 0
stream_df$gender <- gsub("M", "1", stream_df$gender) #change gender M to 1
stream_df$gender <- as.numeric(stream_df$gender) #convert gender from character to numeric
streamF <- filter(stream_df, gender == 0) # select only females
streamM <- filter(stream_df, gender == 1) # select only males
str(stream_df)
summary(stream_df) # Inspect the dataset for possible errors
sum(is.na(stream_df)) # count missing values (should be 0)
mean(is.na(stream_df)) # proportion of missing values (should be 0)
# From the checks, it can be confirmed that the dataset is clean.
A1 <- filter(stream_df, group == "A") # select group A for the full period
A <- subset(A1, select = -c(date, group)) #Exclude date and group
str(A)
pairs.panels(A,
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = TRUE, # show density plots
ellipses = TRUE # show correlation ellipses
)
# The scatterplot matrix and correlation coefficients show a correlation of 0.76 between age and demographic.
# To limit multicollinearity, only one of the two is included in the multiple regression (MR) model.
# With hours_watched (hrw) as the dependent variable, the correlation between hrw and age (-0.59) is stronger
# than between hrw and demographic (-0.45), so age is chosen as a predictor alongside social_metric.
# The remaining variables are excluded because of their low correlation (below 0.03) with hrw.
# It is also observed that only hours_watched displays an approximately normal distribution.
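# Optional collinearity check (a sketch, not part of the original analysis):
# the variance inflation factor for age, computed in base R as 1 / (1 - R^2)
# from regressing age on the other candidate predictors. Values above ~5
# would flag problematic collinearity.
vif_age <- 1 / (1 - summary(lm(age ~ social_metric + demographic, data = A))$r.squared)
vif_age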
MLRA <- lm(hours_watched ~ age + social_metric, data = A)
summary(MLRA)
# The ANOVA for MLRA shows that the p-values for age and social_metric are significant at the 1% level,
# so both variables are retained in the multiple regression model.
# The intercept coefficient suggests that hours_watched is about 6.54 hours when the independent variables
# are zero. This baseline may reflect factors not captured by the MR; for example, during a covid lockdown
# people generally streamed more because they could not go to work or take part in other activities.
# The coefficient for age is -0.07, consistent with the scatter plot above, where a weak negative
# relationship between hours_watched and age is observed: older people stream less than younger people.
# The coefficient for social_metric is 0.085. The relationship between hours_watched and social_metric is
# less clear because the points are widely spread in the scatter plot, which reflects the nature of the
# data: social_metric takes categorical values between 0 and 10. The weak positive relationship is
# nevertheless significant and confirmed in the MR.
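# Illustration (a sketch; the input values below are assumptions for the
# example, not taken from the data): predicted hours_watched for a
# hypothetical 25-year-old viewer with social_metric 5, using the fitted MR.
predict(MLRA, newdata = data.frame(age = 25, social_metric = 5))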
# 1. Linearity: the scatter plot shows a linear relationship between hrw and age, satisfying the linearity assumption.
# 2. Constant variance
# Create a linear model hrw vs age
y <- A$hours_watched # select hours_watched as y
x <- A$age # select age as x
fit1 <- lm(y ~ x)
# coefficients: [1] = intercept, [2] = slope; extract with coef(fit1)
summary(fit1)
a0 <- coef(fit1)[1]
a1 <- coef(fit1)[2]
# Plot the fit over the data
# setup x variable with range of interest
xfit1 <- seq(min(x), max(x), 1)
# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- a0 + a1 * xfit1
gg <- ggplot()
gg <- gg + geom_point(aes(x = x, y = y, colour = 'hours_watched'))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'black')
gg <- gg + labs(x = 'age', y = 'hours_watched')
gg
# The plot of hours_watched vs age, with the fitted line (fit1) over the actual observations, shows that the errors are evenly distributed around the line, so the constant-variance assumption is satisfied.
# calculate the residuals
Af1 <- a0 + a1 * A$age # Af1 is the fitted hours_watched at the observed ages
A$e1 <- A$hours_watched - Af1
gg <- ggplot()
gg <- gg + geom_point(aes(x = x, y = A$e1))
gg <- gg + labs(x = 'age', y = 'residual')
gg
qqnorm(A$e1,
main = 'QQ Plot for residual1 of hours_watched (Actual minus Expected)',
xlab = 'Theoretical Quantiles', ylab = 'Residual1', col="violet")
qqline(A$e1)
# The QQ plot of the residuals of hours_watched (actual minus expected) shows that the residuals are approximately normally distributed, as the assumption requires.
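# Formal numeric check (a sketch, not in the original analysis): the
# Shapiro-Wilk test complements the visual QQ-plot check; a p-value above
# 0.05 is consistent with normally distributed residuals.
shapiro.test(A$e1)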
```

```{r check-histogram-of-residual1}
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = A$e1), bins = 10, colour = "black")
gg <- gg + labs(title = "Residual1 of hours_watched (Actual minus Expected)", x = "Residual1")
gg
```

```{r linearity-social-metric}
# 1. Linearity: the scatter plot shows a linear relationship between hrw and social_metric, satisfying the linearity assumption.
# 2. Constant variance
# Create a linear model: hrw vs social_metric
y2 <- A$hours_watched # select hours_watched as y2
x2 <- A$social_metric # select social_metric as x2
fit2 <- lm(y2 ~ x2)
# coefficients: [1] = intercept, [2] = slope; extract with coef(fit2)
summary(fit2)
b0 <- coef(fit2)[1]
b1 <- coef(fit2)[2]
# Plot the fit over the data
# setup x variable with range of interest
xfit2 <- seq(min(x2), max(x2), 1)
# calculate the fitting function yfit based on the coefficients from the model
yfit2 <- b0 + b1 * xfit2
gg <- ggplot()
gg <- gg + geom_point(aes(x = x2, y = y2, colour = 'hours_watched'))
gg <- gg + geom_line(aes(x = xfit2, y = yfit2), colour = 'black')
gg <- gg + labs(x = 'social_metric', y = 'hours_watched')
gg
# The plot of hours_watched vs social_metric, with the fitted line (fit2) over the actual observations, shows that the errors are evenly distributed around the line, so the constant-variance assumption is satisfied.
# calculate the residuals
Af2 <- b0 + b1 * A$social_metric # Af2 is the fitted hours_watched at the observed social_metric values
A$e2 <- A$hours_watched - Af2
gg <- ggplot()
gg <- gg + geom_point(aes(x = x2, y = A$e2))
gg <- gg + labs(x = 'social_metric', y = 'residual')
gg
qqnorm(A$e2,
main = 'QQ Plot for residual2 of hours_watched (Actual minus Expected)',
xlab = 'Theoretical Quantiles', ylab = 'Residual2', col="violet")
qqline(A$e2)
# The QQ plot of the residuals of hours_watched (actual minus expected) shows that the residuals are approximately normally distributed, as the assumption requires.
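# Formal numeric check (a sketch, as for residual1): Shapiro-Wilk test on the
# residuals of the social_metric model.
shapiro.test(A$e2)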
```

```{r check-histogram-of-residual2}
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = A$e2), bins = 10, colour = "black")
gg <- gg + labs(title = "Residual2 of hours_watched (Actual minus Expected)", x = "Residual2")
gg
```

```{r population-stats-and-sample-size}
# Find the mean and sd of hours_watched for the whole population (the whole dataset)
Pophrwmean <- mean(stream_df$hours_watched)
Pophrwsd <- sd(stream_df$hours_watched)
# Find the effect size (difference in mean hrw) between groups A and B from 18/7
head(stream_df)
tail(stream_df)
M1 <- stream_df[1:548,] #Select rows from start till 17 July
post177 <- stream_df[549:1000,] #Select rows from 18 July till end
M2 <- filter(post177, group == "A") # select group A from 18 July till end
M3 <- filter(post177, group == "B") # select group B from 18 July till end
head(M1)
tail(M1)
mean(M1$hours_watched)
sd(M1$hours_watched)
M1hrw <- M1$hours_watched
head(M2)
tail(M2)
mean(M2$hours_watched)
sd(M2$hours_watched)
M2hrw <- M2$hours_watched
head(M3)
tail(M3)
mean(M3$hours_watched)
sd(M3$hours_watched)
M3hrw <- M3$hours_watched
Margin <- mean(M3$hours_watched) - mean(M2$hours_watched) # effect estimate: difference in mean hrw between B and A after 17/7
z_alpha <- 1.96 # 95% confidence level
effect_est <- Margin # observed effect is about 0.41 hours
sd_est <- Pophrwsd # population sd is about 1.33
n_ss <- ceiling((z_alpha * sd_est / effect_est)^2)
print(paste('Min sample size', n_ss))
# minimum sample size is 41
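# Helper (a sketch, not in the original analysis): minimum sample size from
# the normal-approximation formula n = (z * sd / effect)^2, rounded up.
# Reproduces n_ss above.
min_n <- function(effect, sd, z = 1.96) ceiling((z * sd / effect)^2)
min_n(effect_est, sd_est)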
```

## The dataset was checked previously and confirmed to be clean and ready for use.
## Check the streaming behaviour between older and younger people across the whole dataset

```{r explore-age-and-gender}
stream_df <- read.csv(here(stream), header = TRUE)
p1<- ggplot(stream_df, aes(x=age, y=hours_watched)) +
geom_point(size=2, shape=23)
p1
# The plot shows that older people have lower hours_watched.
## Check the streaming behaviour between females and males
streamF <- filter(stream_df, gender == "F") # select only females
streamM <- filter(stream_df, gender == "M") # select only males
p2<- ggplot(stream_df, aes(x=gender, y=hours_watched)) +
geom_point(size=2, shape=23)
p2
Fmeanhrw <- mean(streamF$hours_watched) # average hrw of females ("F")
Mmeanhrw <- mean(streamM$hours_watched) # average hrw of males ("M")
# Average hrw for females is 4.35 hours
# Average hrw for males is 4.42 hours
# On average, males spend slightly more time (about 4.2 minutes) than females, and the plot shows more males than females streaming between 7 and 8 hrw.
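# Quick two-sample check (a sketch, not in the original analysis): a Welch
# t-test of mean hours_watched between females and males; a large p-value
# would suggest the small gap is not statistically significant.
t.test(streamF$hours_watched, streamM$hours_watched)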
stream_df2 <- read.csv(here(stream), header = TRUE)
p4<-ggplot(data=stream_df2, aes(x=age, y=hours_watched)) +
geom_bar(stat="identity")
p4
# The total hours_watched for ages 20, 23 and 28 exceeds 175 hours.
# Age would be a logical divider in terms of demographic groups of interest in later analysis.
# Different marketing strategies would be needed based on age and gender.
gg <- ggplot(stream_df2, aes(x = age, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg
cond <- stream_df2$gender == 'M'
sorted_M <- stream_df2[cond,] %>% arrange(age)
sorted_M$number <- 1
sorted_M$cumsum <- cumsum(sorted_M$number)
cond <- stream_df2$gender == 'F'
sorted_F <- stream_df2[cond,] %>% arrange(age)
sorted_F$number <- 1
sorted_F$cumsum <- cumsum(sorted_F$number)
# Smaller percentages increase the size of the treatment group but decrease the number of potential groups to investigate later.
# As an example, a 10% allocation to the control group is shown below.
# set control group percentage
percentage_cg <- 0.1
# number required in each group
del_n <- n_ss / percentage_cg
# plot break points
gg <- ggplot()
gg <- gg + geom_line(aes(x = sorted_M$age, y = sorted_M$cumsum),
colour = 'dark blue')
gg <- gg + geom_line(aes(x = sorted_F$age, y = sorted_F$cumsum),
colour = 'dark red')
gg <- gg + geom_hline(yintercept = seq(del_n, 543, del_n))
gg
# females are the smaller group, so they set the limit on the number of groups
n_groups_f <- floor(max(sorted_F$cumsum) / del_n)
print(paste('Number of groups for females:', n_groups_f))
## Can the data be clustered?
# Check for patterns in the data by plotting hours_watched against age
int_breaks_rounded <- function(x, n = 10) pretty(x, n)[round(pretty(x, n),1) %% 1 == 0]
gg <- ggplot()
gg <- gg + geom_point(aes(x = stream_df2$age,
y = stream_df2$hours_watched))
gg <- gg + scale_y_continuous(breaks = int_breaks_rounded )
gg <- gg + labs(x = 'Age (years)', y = 'hours_watched')
gg
# The plot shows that for ages below 35, many hours_watched values exceed 7 hours.
# A clear split in the data occurs at hours_watched = 7.
# A midway age split for the cluster above the 7-hour line is about 28; for the lower cluster it is about 45.
age_cut_above <- 28
age_cut_below <- 45
# group above/below the line
stream_df2$above <- stream_df2$hours_watched > 7
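# Quick tally of the split (a sketch, not in the original analysis): counts of
# observations above and below the 7-hour line.
table(above_7_hours = stream_df2$above)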
# create young/old groups with different splits depending on the above/below group
# (sketch only: summary_df is not defined in this document)
# required percentage to be set aside for the control group
# summary_df$pc_needed <- n_ss / summary_df$n
# max required percentage
# pc_needed <- max(summary_df$pc_needed)
# print(paste('pc_needed:', pc_needed))
# count the numbers in each demographic category based on the A/B groups created by WNW
# Calculate the percentage (pc) of the demographic (demo) distribution in A
# separate demographic into 1 to 4
M2A <- filter(post177, group == "A") # select group A from 18 July till end
AM2demo1 <- filter(M2A, demographic == 1) # contains demographic = 1
AM2demo2 <- filter(M2A, demographic == 2) # contains demographic = 2
AM2demo3 <- filter(M2A, demographic == 3) # contains demographic = 3
AM2demo4 <- filter(M2A, demographic == 4) # contains demographic = 4
AM2demo <- filter(M2A, demographic %in% 1:4) # contains demographic 1 to 4
Ademo1Count <- AM2demo1 %>% count(demographic) # count rows with demographic = 1
Ademo2Count <- AM2demo2 %>% count(demographic) # count rows with demographic = 2
Ademo3Count <- AM2demo3 %>% count(demographic) # count rows with demographic = 3
Ademo4Count <- AM2demo4 %>% count(demographic) # count rows with demographic = 4
AdemoCount <- AM2demo %>% count(demographic) # count rows for all demographics in A
Ademo1C <- sum(Ademo1Count[,2]) # Extract number of demo = 1
Ademo2C <- sum(Ademo2Count[,2])
Ademo3C <- sum(Ademo3Count[,2])
Ademo4C <- sum(Ademo4Count[,2])
AdemoC <- sum(AdemoCount[,2])
ATot_demo <- AdemoC
Ademo1 <- round((Ademo1C/ATot_demo) * 100, digits= 2)
Ademo2 <- round((Ademo2C/ATot_demo) * 100, digits= 2)
Ademo3 <- round((Ademo3C/ATot_demo) * 100, digits= 2)
Ademo4 <- round((Ademo4C/ATot_demo) * 100, digits= 2)
print(paste0("Ademographic 1 % = ", Ademo1, " Ademographic 2 % = ", Ademo2, " Ademographic 3 % = ", Ademo3, " Ademographic 4 % = ", Ademo4))
# Calculate percentage (pc) of demographic (demo) distribution in B
# separate demographic into 1 to 4
M2B <- filter(post177, group == "B") # select group B from 18 July till end
BM2demo1 <- filter(M2B, demographic == 1) # contains demographic = 1
BM2demo2 <- filter(M2B, demographic == 2) # contains demographic = 2
BM2demo3 <- filter(M2B, demographic == 3) # contains demographic = 3
BM2demo4 <- filter(M2B, demographic == 4) # contains demographic = 4
BM2demo <- filter(M2B, demographic %in% 1:4) # contains demographic 1 to 4
Bdemo1Count <- BM2demo1 %>% count(demographic) # count rows with demographic = 1
Bdemo2Count <- BM2demo2 %>% count(demographic) # count rows with demographic = 2
Bdemo3Count <- BM2demo3 %>% count(demographic) # count rows with demographic = 3
Bdemo4Count <- BM2demo4 %>% count(demographic) # count rows with demographic = 4
BdemoCount <- BM2demo %>% count(demographic) # count rows for all demographics in B
Bdemo1C <- sum(Bdemo1Count[,2]) # Extract number of demo = 1
Bdemo2C <- sum(Bdemo2Count[,2]) # Extract number of demo = 2
Bdemo3C <- sum(Bdemo3Count[,2]) # Extract number of demo = 3
Bdemo4C <- sum(Bdemo4Count[,2]) # Extract number of demo = 4
BdemoC <- sum(BdemoCount[,2]) # Extract number of all demographic in B
BTot_demo <- BdemoC
Bdemo1 <- round((Bdemo1C/BTot_demo) * 100, digits= 2)
Bdemo2 <- round((Bdemo2C/BTot_demo) * 100, digits= 2)
Bdemo3 <- round((Bdemo3C/BTot_demo) * 100, digits= 2)
Bdemo4 <- round((Bdemo4C/BTot_demo) * 100, digits= 2)
print(paste0("Bdemographic 1 % = ", Bdemo1, " Bdemographic 2 % = ", Bdemo2, " Bdemographic 3 % = ", Bdemo3, " Bdemographic 4 % = ", Bdemo4))
## Check group balances
# WNW set up the A/B groups; double-check that the A/B groups represent the total population.
# The hypothesis tests will be done on each subgroup because groups A and B are biased samples that do not represent the whole population.
# Check the sd of each subgroup to see whether the variances are equal between A and B; if unequal, use Welch's test.
demo1diff <- sd(AM2demo1$hours_watched) - sd(BM2demo1$hours_watched)
demo2diff <- sd(AM2demo2$hours_watched) - sd(BM2demo2$hours_watched)
demo3diff <- sd(AM2demo3$hours_watched) - sd(BM2demo3$hours_watched)
demo4diff <- sd(AM2demo4$hours_watched) - sd(BM2demo4$hours_watched)
demo1diff
demo2diff
demo3diff
demo4diff
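# Formal variance check (a sketch, not in the original analysis): an F-test of
# equal variances for demographic 1; a small p-value supports using Welch's
# t-test, which does not assume equal variances.
var.test(AM2demo1$hours_watched, BM2demo1$hours_watched)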
# Null hypothesis: the mean hours_watched of A and B are the same
# Alternative hypothesis: the mean hours_watched of A and B are not the same
t.test(AM2demo1$hours_watched, BM2demo1$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 1
t.test(AM2demo2$hours_watched, BM2demo2$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 2
t.test(AM2demo3$hours_watched, BM2demo3$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 3
t.test(AM2demo4$hours_watched, BM2demo4$hours_watched, alternative = "two.sided", var.equal = FALSE) # demographic = 4
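# Convenience summary (a sketch, not in the original analysis): collect the
# four Welch-test p-values in one vector for easier comparison.
a_groups <- list(AM2demo1, AM2demo2, AM2demo3, AM2demo4)
b_groups <- list(BM2demo1, BM2demo2, BM2demo3, BM2demo4)
mapply(function(a, b) t.test(a$hours_watched, b$hours_watched)$p.value,
       a_groups, b_groups)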
# use new names for the subgroup means so the demographic percentages computed above are not overwritten
Ameandemo1 <- mean(AM2demo1$hours_watched)
Ameandemo2 <- mean(AM2demo2$hours_watched)
Ameandemo3 <- mean(AM2demo3$hours_watched)
Ameandemo4 <- mean(AM2demo4$hours_watched)
Bmeandemo1 <- mean(BM2demo1$hours_watched)
Bmeandemo2 <- mean(BM2demo2$hours_watched)
Bmeandemo3 <- mean(BM2demo3$hours_watched)
Bmeandemo4 <- mean(BM2demo4$hours_watched)
# Effect is defined as mean hrw of B minus mean hrw of A
demo1E1 <- Bmeandemo1 - Ameandemo1 # effect for demo 1
demo2E2 <- Bmeandemo2 - Ameandemo2 # effect for demo 2
demo3E3 <- Bmeandemo3 - Ameandemo3 # effect for demo 3
demo4E4 <- Bmeandemo4 - Ameandemo4 # effect for demo 4
Name <- c("demo1", "demo2", "demo3", "demo4")
Effect <- c(demo1E1, demo2E2, demo3E3, demo4E4)
df <- data.frame(Name, Effect)
print(df)
# count the proportion of each demographic group based on the complete dataset from 1/7 to 31/7
check_w_df <- stream_df %>%
  select(demographic) %>%
  group_by(demographic) %>%
  mutate(n_w = n()) %>%
  distinct()
Wn_total <- nrow(stream_df) # total number of observations
# proportions in each demographic
check_w_df$p_w <- check_w_df$n_w / Wn_total
df2 <- arrange(check_w_df, demographic) #sort demographic in order
Effect <- c(demo1E1, demo2E2, demo3E3, demo4E4)
Prop <- df2$p_w
WAFinalE <- round(sum(Effect * Prop), digits=4) # Weighted Average final effect
print(paste('WA final effect of hours_watched is', WAFinalE))
# The weighted-average final effect of hours_watched due to the new recommendation engine is estimated to be 0.5975 hours (about 36 minutes).
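# Cross-check (a sketch, not in the original analysis): since the proportions
# sum to one, base R's weighted.mean reproduces the same weighted-average
# effect.
weighted.mean(Effect, Prop)
```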