df<- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/2. Applied Analytics/Assignment 3/data/streaming_data.csv")
head(df)
##         date gender age social_metric time_since_signup demographic group
## 1 2021-07-01      F  28             5              19.3           1     A
## 2 2021-07-01      F  32             7              11.5           1     A
## 3 2021-07-01      F  39             4               4.3           3     A
## 4 2021-07-01      M  52            10               9.5           4     A
## 5 2021-07-01      M  25             1              19.5           2     A
## 6 2021-07-01      M  51             0              22.6           4     A
##   hours_watched
## 1          4.08
## 2          2.99
## 3          5.74
## 4          4.13
## 5          4.68
## 6          3.40

Preprocess the Data

#check data loaded correctly
dim(df)
## [1] 1000    8
#check data types
sapply(df, class)
##              date            gender               age     social_metric 
##       "character"       "character"         "integer"         "integer" 
## time_since_signup       demographic             group     hours_watched 
##         "numeric"         "integer"       "character"         "numeric"
#Reassign characters as factors
df$date<-as.Date(df$date)
df$gender<-as.factor(df$gender)
df$group<-as.factor(df$group)

#summary
summary(df)
##       date            gender       age        social_metric   
##  Min.   :2021-07-01   F:429   Min.   :18.00   Min.   : 0.000  
##  1st Qu.:2021-07-08   M:571   1st Qu.:28.00   1st Qu.: 2.000  
##  Median :2021-07-16           Median :36.00   Median : 5.000  
##  Mean   :2021-07-16           Mean   :36.49   Mean   : 4.911  
##  3rd Qu.:2021-07-24           3rd Qu.:46.00   3rd Qu.: 8.000  
##  Max.   :2021-07-31           Max.   :55.00   Max.   :10.000  
##  time_since_signup  demographic    group   hours_watched  
##  Min.   : 0.00     Min.   :1.000   A:880   Min.   :0.500  
##  1st Qu.: 5.70     1st Qu.:2.000   B:120   1st Qu.:3.530  
##  Median :11.80     Median :3.000           Median :4.415  
##  Mean   :11.97     Mean   :2.603           Mean   :4.393  
##  3rd Qu.:18.70     3rd Qu.:4.000           3rd Qu.:5.322  
##  Max.   :24.00     Max.   :4.000           Max.   :8.300
#number of samples per day
tally<-count(df,date)
min(tally$n)
## [1] 32
max(tally$n)
## [1] 33
#mutate demographic
df<-mutate(df, demographic=ifelse(df$demographic==1, "Younger Female", ifelse(df$demographic==2, "Younger Male", ifelse( df$demographic==3, "Older Female", ifelse(df$demographic==4, "Older Male","")))))

Initial data - Trend by total group

#calculate average hours watched per day
meanhrs_totalds<-aggregate(hours_watched~date,df,mean)

#plot average hours watched by day
ggplot(meanhrs_totalds, 
       aes(x = date, 
           y = hours_watched))+
      labs( title="Average Daily Hours Watched",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth() +
  geom_vline(xintercept=as.numeric(meanhrs_totalds$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_totalds,
       aes(x = date,
           y = hours_watched)) +
  labs( title="Average Daily Hours Watched - Regression",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method = lm)

Trend by Group and Date

#Average hours watched by group
meanhrs_grp<-df%>% group_by(group, date)%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_grp,
       aes(x = date,
           y = hours_watched,
           color = group,
           group = group)) +
  labs( title="Average Daily Hours Watched by Group",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth() +
   geom_vline(xintercept=as.numeric(meanhrs_totalds$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_grp,
       aes(x = date,
           y = hours_watched,
           color = group,
           group = group)) +
  labs( title="Average Daily Hours Watched by Group - Regression",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method = lm)

#Average hours watched by gender
meanhrs_gender<-df%>% group_by(gender, date)%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_gender,
       aes(x = date,
           y = hours_watched,
           color = gender,
           group = gender)) +
  labs( title="Average Daily Hours Watched by Gender",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth() +
  geom_vline(xintercept=as.numeric(meanhrs_totalds$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_gender,
       aes(x = date,
           y = hours_watched,
           color = gender,
           group = gender)) +
  labs( title="Average Daily Hours Watched by Gender - Regression",
           x = "Date",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method = lm)

#Average hours watched by age
meanhrs_age_pre<-df%>% filter(date<="2021-07-18")%>%group_by(age)%>%summarise(hours_watched=mean(hours_watched))
meanhrs_age_post<-df%>% filter(date>="2021-07-18")%>%group_by(age)%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_age_pre,
       aes(x = age,
           y = hours_watched)) +
    labs( title="Average Daily Hours Watched by Age - Pre A/B",
           x = "Age",
           y = "Average Hours") +
  geom_point() +
  stat_smooth()

ggplot(meanhrs_age_post,
       aes(x = age,
           y = hours_watched)) +
    labs( title="Average Daily Hours Watched by Age - During A/B",
           x = "Age",
           y = "Average Hours") +
  geom_point() +
  stat_smooth()

meanhrs_age_comp<-full_join(meanhrs_age_pre, meanhrs_age_post, by='age')

#v1
ggplot() + 
  geom_line(data = meanhrs_age_pre, aes(x = age, y = hours_watched, color = "Pre-A/B")) +
  geom_line(data = meanhrs_age_post, aes(x = age, y = hours_watched, color = "Post-A/B")) +
  labs( title="Average Daily Hours Watched by Age - Pre v Post A/B",
           x = "Age",
           y = "Average Hours")

#Average hours watched by demographic
meanhrs_dem<-df%>% group_by(demographic, date)%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_dem,
       aes(x = date,
           y = hours_watched,
           color = demographic,
           group = demographic)) +
  labs( title="Average Daily Hours Watched by Demographic",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth() +
  geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_dem,
       aes(x = date,
           y = hours_watched,
           color = demographic,
           group = demographic)) +
    labs( title="Average Daily Hours Watched by Demographic",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method = lm)

#include groups in dataset
df$demgroup<-ifelse(df$demographic=="Younger Female"|1 & df$group=="A", "GroupA YF", ifelse(df$demographic=="Younger Male"|2 & df$group=="A", "GroupA YM", ifelse( df$demographic=="Older Female"|3 & df$group=="A", "GroupA OF", ifelse(df$demographic=="Older Male" |4 & df$group =="A","GroupA OM", ifelse(df$demographic=="Younger Female"|1 & df$group=="B", "GroupB YF",ifelse(df$demographic=="Younger Male"|2 & df$group=="B", "GroupB YM", ifelse( df$demographic=="Older Female"|3 & df$group=="B", "GroupB OF", ifelse(df$demographic=="Older Male"|4 & df$group =="B","GroupB OM",""))))))))
  
meanhrs_demgroup<-df%>% group_by(demgroup, date)%>%summarise(hours_watched=mean(hours_watched))

#Young Females
meanhrs_YF<-df%>% group_by(demgroup, date)%>%filter(demographic=="Younger Female")%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_YF,
       aes(x = date,
           y = hours_watched,
           color = demgroup,
           group = demgroup)) +
  labs( title="Average Daily Hours Watched by Young Females",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method=lm)+
  geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Young Males
meanhrs_YM<-df%>% group_by(demgroup, date)%>%filter(demographic=="Younger Male")%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_YM,
       aes(x = date,
           y = hours_watched,
           color = demgroup,
           group = demgroup)) +
  labs( title="Average Daily Hours Watched by Young Males",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method=lm)+
  geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Old Females
meanhrs_OF<-df%>% group_by(demgroup, date)%>%filter(demographic=="Older Female")%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_OF,
       aes(x = date,
           y = hours_watched,
           color = demgroup,
           group = demgroup)) +
  labs( title="Average Daily Hours Watched by Older Females",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method=lm)+
  geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Old Males
meanhrs_OM<-df%>% group_by(demgroup, date)%>%filter(demographic=="Older Male")%>%summarise(hours_watched=mean(hours_watched))

ggplot(meanhrs_OM,
       aes(x = date,
           y = hours_watched,
           color = demgroup,
           group = demgroup)) +
  labs( title="Average Daily Hours Watched by Older Males",
           x = "Demographic",
           y = "Average Hours") +
  geom_point() +
  stat_smooth(method=lm)+
  geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

Potential Bias Error

Age Group Bias

#summarise tables
df$agegroup<-ifelse(df$age<=35, "Younger","Older")

#agegroup proportion pre-test
ageprop_pretest<-df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))

#agegroup proportion post-test - Group A
agepropA_test<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))

#agegroup proportion post-test - Group B
agepropB_test<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))

age_bias<-full_join(agepropA_test, agepropB_test, by='agegroup')
age_bias
##   group.x agegroup n.x    prop.x group.y n.y prop.y
## 1       A    Older 162 0.4879518       B  75  0.625
## 2       A  Younger 170 0.5120482       B  45  0.375

Gender Bias

#pretest split
df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, gender)%>%mutate(prop=prop.table(n))
##   group gender   n      prop
## 1     A      F 237 0.4324818
## 2     A      M 311 0.5675182
#test period split
gbA<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, gender)%>%mutate(prop=prop.table(n))
gbB<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, gender)%>%mutate(prop=prop.table(n))
gender_bias<-full_join(gbA, gbB, by='gender')
gender_bias
##   group.x gender n.x    prop.x group.y n.y    prop.y
## 1       A      F 163 0.4909639       B  29 0.2416667
## 2       A      M 169 0.5090361       B  91 0.7583333

Demographic Bias

df<-mutate(df, demographic=ifelse(df$demographic==1, "Younger Female", ifelse(df$demographic==2, "Younger Male", ifelse( df$demographic==3, "Older Female", ifelse(df$demographic==4, "Older Male","")))))

#pre-test
df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
##   group demographic   n prop
## 1     A             548    1
#test
dbA<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
dbB<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
demog_bias<-full_join(dbA, dbB, by='demographic')
demog_bias
##   group.x demographic n.x prop.x group.y n.y prop.y
## 1       A             332      1       B 120      1

Hypothesis Testing

Chi Square

#Chi Square for group
groupchi = table(df$group, df$hours_watched) 
chisq.test(groupchi)
## 
##  Pearson's Chi-squared test
## 
## data:  groupchi
## X-squared = 761.37, df = 500, p-value = 3.754e-13
#Chi Square for demographic
demochi = table(df$demographic, df$hours_watched) 
chisq.test(demochi)
## 
##  Chi-squared test for given probabilities
## 
## data:  demochi
## X-squared = 394.78, df = 500, p-value = 0.9998

ANOVA

#Anova - Is there any significant difference between the average hours watched by group?
res.aov<-aov(hours_watched~group, data=df)
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = hours_watched ~ group, data = df)
## 
## $group
##        diff       lwr       upr     p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Young Males?
AnovaYM<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Younger Male")
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = hours_watched ~ group, data = df)
## 
## $group
##        diff       lwr       upr     p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Young Females?
AnovaYF<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Younger Female")
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Anova - Is there any significant difference between the average hours watched by group for Old Males?
AnovaOM<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Older Male")
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = hours_watched ~ group, data = df)
## 
## $group
##        diff       lwr       upr     p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Old Females?
AnovaOF<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Older Female")
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## group         1   23.8  23.801   13.56 0.000243 ***
## Residuals   998 1751.6   1.755                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1