title: “WNW assessment of Streaming algorithm change on watching volume” author: “Nathaniel Mitchell 3975239” date: ‘Last updated: 23/02/2023’ output: ioslides_presentation: highlight: haddock beamer_presentation: highlight: haddock slidy_presentation: font_adjustment: 0 highlight: haddock mathjax: default df_print: paged subtitle: alternative recommendation engine outcomes versus older version editor_options: markdown: wrap: sentence

Introduction

setwd("C:/Users/Nathaniel Mitchell/Desktop/Data Science/advanced statistics R/assessments/assessment 3")
data <- read.csv("streaming_data.csv")
head(data)
##    date gender age social_metric time_since_signup demographic group
## 1 1-Jul      F  28             5              19.3           1     A
## 2 1-Jul      F  32             7              11.5           1     A
## 3 1-Jul      F  39             4               4.3           3     A
## 4 1-Jul      M  52            10               9.5           4     A
## 5 1-Jul      M  25             1              19.5           2     A
## 6 1-Jul      M  51             0              22.6           4     A
##   hours_watched
## 1          4.08
## 2          2.99
## 3          5.74
## 4          4.13
## 5          4.68
## 6          3.40

Problem Statement

Data

Analysis

# This is a primary full data summary to look for missing values, means, and general shapes for variables
summary(data)
##      date              gender               age        social_metric   
##  Length:1000        Length:1000        Min.   :18.00   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.:28.00   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :36.00   Median : 5.000  
##                                        Mean   :36.49   Mean   : 4.911  
##                                        3rd Qu.:46.00   3rd Qu.: 8.000  
##                                        Max.   :55.00   Max.   :10.000  
##  time_since_signup  demographic       group           hours_watched  
##  Min.   : 0.00     Min.   :1.000   Length:1000        Min.   :0.500  
##  1st Qu.: 5.70     1st Qu.:2.000   Class :character   1st Qu.:3.530  
##  Median :11.80     Median :3.000   Mode  :character   Median :4.415  
##  Mean   :11.97     Mean   :2.603                      Mean   :4.393  
##  3rd Qu.:18.70     3rd Qu.:4.000                      3rd Qu.:5.322  
##  Max.   :24.00     Max.   :4.000                      Max.   :8.300
ggplot(data, aes(x = age)) +
  geom_density(fill = "green", alpha = 0.6) +
  xlab("Age") +
  ylab("Density") +
  ggtitle("Density Plot of Age")

hist(data$social_metric, col = "orange")

hist(data$time_since_signup, col = "purple")

hist(data$demographic, col = "red")

ggplot(data, aes(x = hours_watched)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  xlab("Hours Watched") +
  ylab("Density") +
  ggtitle("Density Plot of Hours Watched")

# Create bins for time since signup
bins <- c(2, 6, 11, 17, 22, Inf)
labels <- c("new", "<6M", "<12M", "<18M", "<21M")
data$signup_group <- cut(data$time_since_signup, breaks = bins, labels = labels)

# Display the new column
head(data$signup_group)
## [1] <18M <12M new  <6M  <18M <21M
## Levels: new <6M <12M <18M <21M
summary(data$signup_group)
##  new  <6M <12M <18M <21M NA's 
##  145  213  216  225   90  111
#split the data into the information prior to the onset of the experiment
# create a vector of all the dates we want to include in the "before_18_Jul" dataset
target_dates <- c("1-Jul", "2-Jul", "3-Jul", "4-Jul", "5-Jul", "6-Jul", "7-Jul", "8-Jul", "9-Jul", "10-Jul", "11-Jul", "12-Jul", "13-Jul", "14-Jul", "15-Jul", "16-Jul", "17-Jul")

# filter the data into two datasets based on the date column
before_18_Jul <- data %>% filter(date %in% target_dates)
after_18_Jul <- data %>% filter(!date %in% target_dates)

head(before_18_Jul)
##    date gender age social_metric time_since_signup demographic group
## 1 1-Jul      F  28             5              19.3           1     A
## 2 1-Jul      F  32             7              11.5           1     A
## 3 1-Jul      F  39             4               4.3           3     A
## 4 1-Jul      M  52            10               9.5           4     A
## 5 1-Jul      M  25             1              19.5           2     A
## 6 1-Jul      M  51             0              22.6           4     A
##   hours_watched signup_group
## 1          4.08         <18M
## 2          2.99         <12M
## 3          5.74          new
## 4          4.13          <6M
## 5          4.68         <18M
## 6          3.40         <21M
head(after_18_Jul)
##     date gender age social_metric time_since_signup demographic group
## 1 18-Jul      F  39             5              14.8           3     B
## 2 18-Jul      M  45             7               2.4           4     A
## 3 18-Jul      F  50             2               6.6           3     A
## 4 18-Jul      M  39             1              18.7           4     A
## 5 18-Jul      F  18             9              10.5           1     A
## 6 18-Jul      F  52             2               5.3           3     A
##   hours_watched signup_group
## 1          3.74         <12M
## 2          4.00          new
## 3          3.66          <6M
## 4          3.58         <18M
## 5          6.64          <6M
## 6          3.36          new

# Boxplot for signup_group before_18_Jul
#before_18_Julbox <- subset(before_18_Jul, before_18_Jul$signup_group == "before_18_Jul")
boxplot_data1 <- list(hours_watched = before_18_Jul$hours_watched, signup_group = before_18_Jul$signup_group)
before_18_Jul_boxplot <- ggplot(data = data.frame(boxplot_data1), aes(x = signup_group, y = hours_watched, fill = signup_group)) + 
  geom_boxplot() +
  ggtitle("Before 18 Jul") +
  xlab("Signup Group")

# Boxplots for signup_group after_18_Jul, split by group A and group B
#after_18_Julbox <- subset(after_18_Jul, after_18_Jul$signup_group != "before_18_Jul")
boxplot_data2 <- list(hours_watched = after_18_Jul$hours_watched, signup_group = after_18_Jul$signup_group, group = after_18_Jul$group)
after_18_Jul_boxplot <- ggplot(data = data.frame(boxplot_data2), aes(x = signup_group, y = hours_watched, fill = group)) + 
  geom_boxplot() +
  ggtitle("After 18 Jul") +
  xlab("Signup Group")

# Combine the three boxplots
boxplot_combined <- cowplot::plot_grid(before_18_Jul_boxplot, after_18_Jul_boxplot, ncol = 2, align = "v")

# Show the combined boxplots
boxplot_combined

## 
## Call:
## lm(formula = hours_watched ~ group + signup_group, data = after_18_Jul)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8914 -0.8546 -0.0391  0.8867  3.9086 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.24913    0.16098  26.395   <2e-16 ***
## groupB            0.32414    0.15337   2.113   0.0352 *  
## signup_group<6M   0.14002    0.20487   0.683   0.4947    
## signup_group<12M  0.45008    0.21194   2.124   0.0343 *  
## signup_group<18M  0.14225    0.20804   0.684   0.4945    
## signup_group<21M -0.05453    0.26227  -0.208   0.8354    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.347 on 399 degrees of freedom
##   (47 observations deleted due to missingness)
## Multiple R-squared:  0.02642,    Adjusted R-squared:  0.01422 
## F-statistic: 2.166 on 5 and 399 DF,  p-value: 0.05714
## Analysis of Variance Table
## 
## Response: hours_watched
##               Df Sum Sq Mean Sq F value  Pr(>F)  
## group          1   8.31  8.3134  4.5810 0.03293 *
## signup_group   4  11.34  2.8342  1.5617 0.18372  
## Residuals    399 724.09  1.8148                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Subset the after_18_Jul data into separate data frames for groups A and B
after_18_Jul_groupA <- subset(after_18_Jul, group == "A")
after_18_Jul_groupB <- subset(after_18_Jul, group == "B")

# Perform a two-sample t-test to compare the mean watch hours of groups A and B
t.test(after_18_Jul$hours_watched, after_18_Jul_groupB$hours_watched, paired=FALSE, var.equal=FALSE, na.rm=TRUE)
## 
##  Welch Two Sample t-test
## 
## data:  after_18_Jul$hours_watched and after_18_Jul_groupB$hours_watched
## t = -2.1843, df = 192.21, p-value = 0.03015
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.57160852 -0.02914591
## sample estimates:
## mean of x mean of y 
##  4.510498  4.810875
t.test(after_18_Jul$hours_watched, after_18_Jul_groupA$hours_watched, paired=FALSE, var.equal=FALSE, na.rm=TRUE)
## 
##  Welch Two Sample t-test
## 
## data:  after_18_Jul$hours_watched and after_18_Jul_groupA$hours_watched
## t = 1.0907, df = 712.68, p-value = 0.2758
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08685188  0.30399204
## sample estimates:
## mean of x mean of y 
##  4.510498  4.401928
# Subset the data to only include group A and group B
after_18_Jul_groupA <- after_18_Jul %>% filter(group == "A")
after_18_Jul_groupB <- after_18_Jul %>% filter(group == "B")

# Perform a two-sample t-test to compare the means of the two groups
t.test(after_18_Jul_groupA$hours_watched, after_18_Jul_groupB$hours_watched, alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  after_18_Jul_groupA$hours_watched and after_18_Jul_groupB$hours_watched
## t = -2.8602, df = 217.62, p-value = 0.002323
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -0.172761
## sample estimates:
## mean of x mean of y 
##  4.401928  4.810875
# Calculate the mean watchtime for Group A
mean_after_18_Jul_groupA <- mean(after_18_Jul_groupA$hours_watched, na.rm = TRUE)
cat("Mean watch time for Group A:", mean_after_18_Jul_groupA, "\n")
## Mean watch time for Group A: 4.401928
# Calculate the mean watchtime for Group B
mean_after_18_Jul_groupB <- mean(after_18_Jul_groupB$hours_watched, na.rm = TRUE)
cat("Mean watch time for Group B:", mean_after_18_Jul_groupB, "\n")
## Mean watch time for Group B: 4.810875
# Compare the means using a bar chart
means <- data.frame(
  group = c("Group A", "Group B"),
  mean_watch_time = c(mean_after_18_Jul_groupA, mean_after_18_Jul_groupB)
)

ggplot(means, aes(x = group, y = mean_watch_time)) +
  geom_bar(stat = "identity") +
  labs(title = "Comparison of Mean Watch Time between Group A and Group B", y = "Mean Watch Time")

ggplot(after_18_Jul, aes(x = group, y = hours_watched, fill = group)) +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "blue"))

# Create a new data frame with only the columns we are interested in
regression_data <- after_18_Jul %>% 
  select(group, hours_watched, gender, age, social_metric, time_since_signup, demographic)

# Fit a multiple linear regression model
model <- lm(hours_watched ~ group + gender + age + social_metric + time_since_signup + demographic, data = regression_data)

# Print the model summary
summary(model)
## 
## Call:
## lm(formula = hours_watched ~ group + gender + age + social_metric + 
##     time_since_signup + demographic, data = regression_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75596 -0.60838 -0.00599  0.66732  2.76992 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.468445   0.223421  28.952  < 2e-16 ***
## groupB             0.640662   0.113736   5.633 3.15e-08 ***
## genderM            0.150426   0.137564   1.093   0.2748    
## age               -0.065597   0.008866  -7.399 6.90e-13 ***
## social_metric      0.110930   0.016091   6.894 1.87e-11 ***
## time_since_signup  0.005748   0.006793   0.846   0.3979    
## demographic       -0.167337   0.097407  -1.718   0.0865 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.029 on 445 degrees of freedom
## Multiple R-squared:  0.4476, Adjusted R-squared:  0.4401 
## F-statistic: 60.09 on 6 and 445 DF,  p-value: < 2.2e-16
## 
##  Welch Two Sample t-test
## 
## data:  before_18_Jul$hours_watched and after_18_Jul[after_18_Jul$group == "B", "hours_watched"]
## t = -3.8615, df = 171.58, p-value = 0.0001594
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7776705 -0.2515612
## sample estimates:
## mean of x mean of y 
##  4.296259  4.810875

Discussion

Future directions