# Load the streaming data set.
#
# The CSV location can be overridden with the STREAMING_DATA_CSV environment
# variable so the script runs on other machines without being edited; when
# the variable is unset or empty the original path is used.
data_path <- Sys.getenv("STREAMING_DATA_CSV", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/61423/Downloads/streaming_data.csv"
}

# gender and group are read as factors with a fixed level order; age,
# social_metric and demographic are read as integers. Columns not listed
# here are left to readr's type guessing.
df <- read_csv(
  data_path,
  col_types = cols(
    gender = col_factor(levels = c("M", "F")),
    age = col_integer(),
    social_metric = col_integer(),
    demographic = col_integer(),
    group = col_factor(levels = c("A", "B"))
  )
)
## Data preprocessing
# Report the size of the data set as (rows, columns).
print(c(nrow(df), ncol(df)))
## [1] 1000 8
# Column-wise overview of the data. glimpse() prints the overview itself and
# returns its input invisibly, so it is not wrapped in print(): doing so would
# print the whole tibble a second time after the overview.
glimpse(df)
## Rows: 1,000
## Columns: 8
## $ date <chr> "01/07", "01/07", "01/07", "01/07", "01/07", "01/07"~
## $ gender <fct> F, F, F, M, M, M, F, M, M, M, M, F, F, F, M, M, M, M~
## $ age <int> 28, 32, 39, 52, 25, 51, 53, 42, 41, 20, 26, 25, 43, ~
## $ social_metric <int> 5, 7, 4, 10, 1, 0, 5, 6, 8, 7, 9, 5, 1, 10, 4, 9, 9,~
## $ time_since_signup <dbl> 19.3, 11.5, 4.3, 9.5, 19.5, 22.6, 4.2, 8.5, 16.9, 23~
## $ demographic <int> 1, 1, 3, 4, 2, 4, 3, 4, 4, 2, 2, 1, 3, 1, 4, 4, 4, 4~
## $ group <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A~
## $ hours_watched <dbl> 4.08, 2.99, 5.74, 4.13, 4.68, 3.40, 3.07, 2.77, 2.24~
# Confirm the factor levels of gender were read in the order given to read_csv.
print(levels(df[["gender"]]))
## [1] "M" "F"
# Confirm the factor levels of group were read in the order given to read_csv.
print(levels(df[["group"]]))
## [1] "A" "B"
# List the column names. Printed explicitly, like the neighbouring statements,
# so the output also appears when the script is run through source().
print(colnames(df))
## [1] "date" "gender" "age"
## [4] "social_metric" "time_since_signup" "demographic"
## [7] "group" "hours_watched"
# Summary statistics for every column. Printed explicitly so the output also
# appears when the script is run through source(), where top-level values are
# not auto-printed.
print(summary(df))
## date gender age social_metric time_since_signup
## Length:1000 M:571 Min. :18.00 Min. : 0.000 Min. : 0.00
## Class :character F:429 1st Qu.:28.00 1st Qu.: 2.000 1st Qu.: 5.70
## Mode :character Median :36.00 Median : 5.000 Median :11.80
## Mean :36.49 Mean : 4.911 Mean :11.97
## 3rd Qu.:46.00 3rd Qu.: 8.000 3rd Qu.:18.70
## Max. :55.00 Max. :10.000 Max. :24.00
## demographic group hours_watched
## Min. :1.000 A:880 Min. :0.500
## 1st Qu.:2.000 B:120 1st Qu.:3.530
## Median :3.000 Median :4.415
## Mean :2.603 Mean :4.393
## 3rd Qu.:4.000 3rd Qu.:5.322
## Max. :4.000 Max. :8.300
# Per-group descriptive statistics of hours_watched: mean, median, minimum,
# maximum and standard deviation. The summary columns are deliberately left
# unnamed, so each one is labelled with the expression that produced it.
sumstat <- df %>%
  group_by(group) %>%
  summarise(
    mean(hours_watched),
    median(hours_watched),
    min(hours_watched),
    max(hours_watched),
    sd(hours_watched)
  )
print(sumstat)
## # A tibble: 2 x 6
## group `mean(hours_watch~ `median(hours_wa~ `min(hours_watch~ `max(hours_watch~
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A 4.34 4.36 0.5 8.3
## 2 B 4.81 4.86 1.52 7.93
## # ... with 1 more variable: sd(hours_watched) <dbl>
## checking for correlation between hours_watched and all other variables
# Mean hours_watched for every distinct value of each explanatory variable.
# Each result is a two-column data frame: the grouping variable and the mean
# of hours_watched within that value.
# NOTE(review): the plots and correlations built from these frames describe
# the per-value averages, not the individual rows of df.
meanhrs_age <- aggregate(hours_watched ~ age, data = df, FUN = mean)
meanhrs_group <- aggregate(hours_watched ~ group, data = df, FUN = mean)
meanhrs_gender <- aggregate(hours_watched ~ gender, data = df, FUN = mean)
meanhrs_metric <- aggregate(hours_watched ~ social_metric, data = df, FUN = mean)
meanhrs_singup <- aggregate(hours_watched ~ time_since_signup, data = df, FUN = mean)
# Scatter plot of mean hours watched per age, with a linear-model (lm) fit
# drawn on top of the points.
print(
  ggplot(meanhrs_age, aes(x = age, y = hours_watched)) +
    geom_point() +
    stat_smooth(method = lm) +
    labs(
      title = "Average Daily Hours Watched against age - Regression model",
      x = "age",
      y = "Average Hours"
    )
)

# Pairs plot of age against mean hours watched. Wrapped in print(), like the
# other ggpairs() calls in this script, so the figure is also drawn when the
# script is run through source().
print(ggpairs(meanhrs_age, title = "correlation between Hours watched and age"))

# Scatter plot of mean hours watched per social_metric value, with a
# linear-model (lm) fit drawn on top of the points.
print(
  ggplot(meanhrs_metric, aes(x = social_metric, y = hours_watched)) +
    labs(
      # Title spelling corrected ("soical" -> "social").
      title = "Average Daily Hours Watched against social metric - Regression model",
      x = "Social Metric",
      y = "Average Hours"
    ) +
    geom_point() +
    stat_smooth(method = lm)
)

# Pairs plot of social_metric against mean hours watched.
print(
  ggpairs(
    data = meanhrs_metric,
    title = "correlation between Hours watched and social metric"
  )
)

# Scatter plot of mean hours watched per time_since_signup value, with a
# linear-model (lm) fit drawn on top of the points.
print(
  ggplot(meanhrs_singup, aes(x = time_since_signup, y = hours_watched)) +
    labs(
      # Title spelling corrected ("singnup" -> "signup") and the stray
      # trailing space removed from the x-axis label.
      title = "Average Daily Hours Watched against time since signup - Regression model",
      x = "Time Since Signup",
      y = "Average Hours"
    ) +
    geom_point() +
    stat_smooth(method = lm)
)

# Pairs plot of time_since_signup against mean hours watched.
print(
  ggpairs(
    data = meanhrs_singup,
    title = "correlation between Hours watched and time since signup"
  )
)

# Box plot of hours_watched for each gender level, built from the raw rows
# of df (not the aggregated means).
gender_boxplot <- ggplot(data = df, mapping = aes(x = gender, y = hours_watched))
gender_boxplot <- gender_boxplot + geom_boxplot()
print(gender_boxplot)

# Box plot of hours_watched for each group level (A and B), built from the
# raw rows of df (not the aggregated means).
grp_boxplot <- ggplot(data = df, mapping = aes(x = group, y = hours_watched))
grp_boxplot <- grp_boxplot + geom_boxplot()
print(grp_boxplot)
