Preprocess the Data
#check data loaded correctly
dim(df)
## [1] 1000 8
#check data types
sapply(df, class)
## date gender age social_metric
## "character" "character" "integer" "integer"
## time_since_signup demographic group hours_watched
## "numeric" "integer" "character" "numeric"
#Reassign characters as factors
df$date<-as.Date(df$date)
df$gender<-as.factor(df$gender)
df$group<-as.factor(df$group)
#summary
summary(df)
## date gender age social_metric
## Min. :2021-07-01 F:429 Min. :18.00 Min. : 0.000
## 1st Qu.:2021-07-08 M:571 1st Qu.:28.00 1st Qu.: 2.000
## Median :2021-07-16 Median :36.00 Median : 5.000
## Mean :2021-07-16 Mean :36.49 Mean : 4.911
## 3rd Qu.:2021-07-24 3rd Qu.:46.00 3rd Qu.: 8.000
## Max. :2021-07-31 Max. :55.00 Max. :10.000
## time_since_signup demographic group hours_watched
## Min. : 0.00 Min. :1.000 A:880 Min. :0.500
## 1st Qu.: 5.70 1st Qu.:2.000 B:120 1st Qu.:3.530
## Median :11.80 Median :3.000 Median :4.415
## Mean :11.97 Mean :2.603 Mean :4.393
## 3rd Qu.:18.70 3rd Qu.:4.000 3rd Qu.:5.322
## Max. :24.00 Max. :4.000 Max. :8.300
#number of samples per day
tally<-count(df,date)
min(tally$n)
## [1] 32
max(tally$n)
## [1] 33
#mutate demographic
df<-mutate(df, demographic=ifelse(df$demographic==1, "Younger Female", ifelse(df$demographic==2, "Younger Male", ifelse( df$demographic==3, "Older Female", ifelse(df$demographic==4, "Older Male","")))))
Trend by Group and Date
#Average hours watched by group
meanhrs_grp<-df%>% group_by(group, date)%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_grp,
aes(x = date,
y = hours_watched,
color = group,
group = group)) +
labs( title="Average Daily Hours Watched by Group",
x = "Date",
y = "Average Hours") +
geom_point() +
stat_smooth() +
geom_vline(xintercept=as.numeric(meanhrs_totalds$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_grp,
aes(x = date,
y = hours_watched,
color = group,
group = group)) +
labs( title="Average Daily Hours Watched by Group - Regression",
x = "Date",
y = "Average Hours") +
geom_point() +
stat_smooth(method = lm)

#Average hours watched by gender
meanhrs_gender<-df%>% group_by(gender, date)%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_gender,
aes(x = date,
y = hours_watched,
color = gender,
group = gender)) +
labs( title="Average Daily Hours Watched by Gender",
x = "Date",
y = "Average Hours") +
geom_point() +
stat_smooth() +
geom_vline(xintercept=as.numeric(meanhrs_totalds$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_gender,
aes(x = date,
y = hours_watched,
color = gender,
group = gender)) +
labs( title="Average Daily Hours Watched by Gender - Regression",
x = "Date",
y = "Average Hours") +
geom_point() +
stat_smooth(method = lm)

#Average hours watched by age
meanhrs_age_pre<-df%>% filter(date<="2021-07-18")%>%group_by(age)%>%summarise(hours_watched=mean(hours_watched))
meanhrs_age_post<-df%>% filter(date>="2021-07-18")%>%group_by(age)%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_age_pre,
aes(x = age,
y = hours_watched)) +
labs( title="Average Daily Hours Watched by Age - Pre A/B",
x = "Age",
y = "Average Hours") +
geom_point() +
stat_smooth()

ggplot(meanhrs_age_post,
aes(x = age,
y = hours_watched)) +
labs( title="Average Daily Hours Watched by Age - During A/B",
x = "Age",
y = "Average Hours") +
geom_point() +
stat_smooth()

meanhrs_age_comp<-full_join(meanhrs_age_pre, meanhrs_age_post, by='age')
#v1
ggplot() +
geom_line(data = meanhrs_age_pre, aes(x = age, y = hours_watched, color = "Pre-A/B")) +
geom_line(data = meanhrs_age_post, aes(x = age, y = hours_watched, color = "Post-A/B")) +
labs( title="Average Daily Hours Watched by Age - Pre v Post A/B",
x = "Age",
y = "Average Hours")

#Average hours watched by demographic
meanhrs_dem<-df%>% group_by(demographic, date)%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_dem,
aes(x = date,
y = hours_watched,
color = demographic,
group = demographic)) +
labs( title="Average Daily Hours Watched by Demographic",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth() +
geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

# Now plotted as regression
ggplot(meanhrs_dem,
aes(x = date,
y = hours_watched,
color = demographic,
group = demographic)) +
labs( title="Average Daily Hours Watched by Demographic",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth(method = lm)

#include groups in dataset
df$demgroup<-ifelse(df$demographic=="Younger Female"|1 & df$group=="A", "GroupA YF", ifelse(df$demographic=="Younger Male"|2 & df$group=="A", "GroupA YM", ifelse( df$demographic=="Older Female"|3 & df$group=="A", "GroupA OF", ifelse(df$demographic=="Older Male" |4 & df$group =="A","GroupA OM", ifelse(df$demographic=="Younger Female"|1 & df$group=="B", "GroupB YF",ifelse(df$demographic=="Younger Male"|2 & df$group=="B", "GroupB YM", ifelse( df$demographic=="Older Female"|3 & df$group=="B", "GroupB OF", ifelse(df$demographic=="Older Male"|4 & df$group =="B","GroupB OM",""))))))))
meanhrs_demgroup<-df%>% group_by(demgroup, date)%>%summarise(hours_watched=mean(hours_watched))
#Young Females
meanhrs_YF<-df%>% group_by(demgroup, date)%>%filter(demographic=="Younger Female")%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_YF,
aes(x = date,
y = hours_watched,
color = demgroup,
group = demgroup)) +
labs( title="Average Daily Hours Watched by Young Females",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth(method=lm)+
geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Young Males
meanhrs_YM<-df%>% group_by(demgroup, date)%>%filter(demographic=="Younger Male")%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_YM,
aes(x = date,
y = hours_watched,
color = demgroup,
group = demgroup)) +
labs( title="Average Daily Hours Watched by Young Males",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth(method=lm)+
geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Old Females
meanhrs_OF<-df%>% group_by(demgroup, date)%>%filter(demographic=="Older Female")%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_OF,
aes(x = date,
y = hours_watched,
color = demgroup,
group = demgroup)) +
labs( title="Average Daily Hours Watched by Older Females",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth(method=lm)+
geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

#Old Males
meanhrs_OM<-df%>% group_by(demgroup, date)%>%filter(demographic=="Older Male")%>%summarise(hours_watched=mean(hours_watched))
ggplot(meanhrs_OM,
aes(x = date,
y = hours_watched,
color = demgroup,
group = demgroup)) +
labs( title="Average Daily Hours Watched by Older Males",
x = "Demographic",
y = "Average Hours") +
geom_point() +
stat_smooth(method=lm)+
geom_vline(xintercept=as.numeric(meanhrs_dem$date[18]), linetype=4, colour='red')

Potential Bias Error
Age Group Bias
#summarise tables
df$agegroup<-ifelse(df$age<=35, "Younger","Older")
#agegroup proportion pre-test
ageprop_pretest<-df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))
#agegroup proportion post-test - Group A
agepropA_test<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))
#agegroup proportion post-test - Group B
agepropB_test<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, agegroup)%>%mutate(prop=prop.table(n))
age_bias<-full_join(agepropA_test, agepropB_test, by='agegroup')
age_bias
## group.x agegroup n.x prop.x group.y n.y prop.y
## 1 A Older 162 0.4879518 B 75 0.625
## 2 A Younger 170 0.5120482 B 45 0.375
Gender Bias
#pretest split
df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, gender)%>%mutate(prop=prop.table(n))
## group gender n prop
## 1 A F 237 0.4324818
## 2 A M 311 0.5675182
#test period split
gbA<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, gender)%>%mutate(prop=prop.table(n))
gbB<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, gender)%>%mutate(prop=prop.table(n))
gender_bias<-full_join(gbA, gbB, by='gender')
gender_bias
## group.x gender n.x prop.x group.y n.y prop.y
## 1 A F 163 0.4909639 B 29 0.2416667
## 2 A M 169 0.5090361 B 91 0.7583333
Demographic Bias
df<-mutate(df, demographic=ifelse(df$demographic==1, "Younger Female", ifelse(df$demographic==2, "Younger Male", ifelse( df$demographic==3, "Older Female", ifelse(df$demographic==4, "Older Male","")))))
#pre-test
df%>%subset(date>="2021-07-01" & date <= "2021-07-17")%>%filter(group=="A")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
## group demographic n prop
## 1 A 548 1
#test
dbA<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="A")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
dbB<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(group=="B")%>%count(group, demographic)%>%mutate(prop=prop.table(n))
demog_bias<-full_join(dbA, dbB, by='demographic')
demog_bias
## group.x demographic n.x prop.x group.y n.y prop.y
## 1 A 332 1 B 120 1
Hypothesis Testing
Chi Square
#Chi Square for group
groupchi = table(df$group, df$hours_watched)
chisq.test(groupchi)
##
## Pearson's Chi-squared test
##
## data: groupchi
## X-squared = 761.37, df = 500, p-value = 3.754e-13
#Chi Square for demographic
demochi = table(df$demographic, df$hours_watched)
chisq.test(demochi)
##
## Chi-squared test for given probabilities
##
## data: demochi
## X-squared = 394.78, df = 500, p-value = 0.9998
ANOVA
#Anova - Is there any significant difference between the average hours watched by group?
res.aov<-aov(hours_watched~group, data=df)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = hours_watched ~ group, data = df)
##
## $group
## diff lwr upr p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Young Males?
AnovaYM<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Younger Male")
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = hours_watched ~ group, data = df)
##
## $group
## diff lwr upr p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Young Females?
AnovaYF<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Younger Female")
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Anova - Is there any significant difference between the average hours watched by group for Old Males?
AnovaOM<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Older Male")
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = hours_watched ~ group, data = df)
##
## $group
## diff lwr upr p adj
## B-A 0.47475 0.2217629 0.7277371 0.0002434
#Anova - Is there any significant difference between the average hours watched by group for Old Females?
AnovaOF<-df%>%subset(date>="2021-07-18" & date <= "2021-07-31")%>%filter(demographic=="Older Female")
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 23.8 23.801 13.56 0.000243 ***
## Residuals 998 1751.6 1.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1