Assessment 1

# Load the streaming data set. gender and group are read as factors with a
# fixed level order, and age, social_metric and demographic as integers; the
# remaining columns are not listed here and are left to readr's type guessing.
df <- read_csv(
  "C:/Users/61423/Downloads/streaming_data.csv",
  col_types = cols(
    gender = col_factor(levels = c("M", "F")),
    age = col_integer(),
    social_metric = col_integer(),
    demographic = col_integer(),
    group = col_factor(levels = c("A", "B"))
  )
)

## Data preprocessing
# Dimensions of the data set as (rows, columns).
print(dim(df))

## [1] 1000    8

# Column-wise overview of types and leading values. glimpse() prints this
# itself and returns its input invisibly, so it is not wrapped in print():
# doing so would print the whole tibble a second time.
glimpse(df)

## Rows: 1,000
## Columns: 8
## $ date              <chr> "01/07", "01/07", "01/07", "01/07", "01/07", "01/07"~
## $ gender            <fct> F, F, F, M, M, M, F, M, M, M, M, F, F, F, M, M, M, M~
## $ age               <int> 28, 32, 39, 52, 25, 51, 53, 42, 41, 20, 26, 25, 43, ~
## $ social_metric     <int> 5, 7, 4, 10, 1, 0, 5, 6, 8, 7, 9, 5, 1, 10, 4, 9, 9,~
## $ time_since_signup <dbl> 19.3, 11.5, 4.3, 9.5, 19.5, 22.6, 4.2, 8.5, 16.9, 23~
## $ demographic       <int> 1, 1, 3, 4, 2, 4, 3, 4, 4, 2, 2, 1, 3, 1, 4, 4, 4, 4~
## $ group             <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A~
## $ hours_watched     <dbl> 4.08, 2.99, 5.74, 4.13, 4.68, 3.40, 3.07, 2.77, 2.24~

# Show the level order of the two factor columns.
print(levels(df[["gender"]]))

## [1] "M" "F"

print(levels(df[["group"]]))

## [1] "A" "B"

# Column names of the data set. Printed explicitly, like the inspection calls
# above, so the output also appears when the script is run through source()
# rather than relying on top-level auto-printing.
print(colnames(df))

## [1] "date"              "gender"            "age"              
## [4] "social_metric"     "time_since_signup" "demographic"      
## [7] "group"             "hours_watched"

# Per-column summary statistics (counts for factors, quantiles for numerics).
print(summary(df))

##      date           gender       age        social_metric    time_since_signup
##  Length:1000        M:571   Min.   :18.00   Min.   : 0.000   Min.   : 0.00    
##  Class :character   F:429   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 5.70    
##  Mode  :character           Median :36.00   Median : 5.000   Median :11.80    
##                             Mean   :36.49   Mean   : 4.911   Mean   :11.97    
##                             3rd Qu.:46.00   3rd Qu.: 8.000   3rd Qu.:18.70    
##                             Max.   :55.00   Max.   :10.000   Max.   :24.00    
##   demographic    group   hours_watched  
##  Min.   :1.000   A:880   Min.   :0.500  
##  1st Qu.:2.000   B:120   1st Qu.:3.530  
##  Median :3.000           Median :4.415  
##  Mean   :2.603           Mean   :4.393  
##  3rd Qu.:4.000           3rd Qu.:5.322  
##  Max.   :4.000           Max.   :8.300

# Mean, median, min, max and standard deviation of hours_watched for each
# level of group. The summary expressions are left unnamed, so each result
# column is named after its expression (e.g. `mean(hours_watched)`).
sumstat <- df %>%
  group_by(group) %>%
  summarise(
    mean(hours_watched),
    median(hours_watched),
    min(hours_watched),
    max(hours_watched),
    sd(hours_watched)
  )
print(sumstat)

## # A tibble: 2 x 6
##   group `mean(hours_watch~ `median(hours_wa~ `min(hours_watch~ `max(hours_watch~
##   <fct>              <dbl>             <dbl>             <dbl>             <dbl>
## 1 A                   4.34              4.36              0.5               8.3 
## 2 B                   4.81              4.86              1.52              7.93
## # ... with 1 more variable: sd(hours_watched) <dbl>

## Checking for correlation between hours_watched and all other variables

# Mean hours_watched at each distinct value of a candidate explanatory
# variable. Each result is a two-column data frame: the grouping variable and
# the corresponding mean of hours_watched.
meanhrs_age <- aggregate(hours_watched ~ age, data = df, FUN = mean)
meanhrs_group <- aggregate(hours_watched ~ group, data = df, FUN = mean)
meanhrs_gender <- aggregate(hours_watched ~ gender, data = df, FUN = mean)
meanhrs_metric <- aggregate(hours_watched ~ social_metric, data = df, FUN = mean)
# The object name below is kept as originally spelt because the plotting code
# further down refers to it by this name.
meanhrs_singup <- aggregate(
  hours_watched ~ time_since_signup,
  data = df,
  FUN = mean
)

# Scatter plot of mean hours watched at each age, with a fitted linear trend
# line (stat_smooth using lm).
print(
  ggplot(meanhrs_age, aes(x = age, y = hours_watched)) +
    labs(
      title = "Average Daily Hours Watched against age - Regression model",
      x = "age",
      y = "Average Hours"
    ) +
    geom_point() +
    stat_smooth(method = lm)
)

# Pairs plot of age against mean hours watched. Printed explicitly, matching
# the other ggpairs() calls in this script, so the figure is also drawn when
# the script is run through source() instead of relying on auto-printing.
print(
  ggpairs(meanhrs_age, title = "correlation between Hours watched and age")
)

# Scatter plot of mean hours watched at each social_metric value, with a
# fitted linear trend line (stat_smooth using lm). The plot title previously
# misspelt "social".
print(
  ggplot(meanhrs_metric, aes(x = social_metric, y = hours_watched)) +
    labs(
      title = "Average Daily Hours Watched against social metric - Regression model",
      x = "Social Metric",
      y = "Average Hours"
    ) +
    geom_point() +
    stat_smooth(method = lm)
)

# Pairs plot of social_metric against mean hours watched.
print(
  ggpairs(
    meanhrs_metric,
    title = "correlation between Hours watched and social metric"
  )
)

# Scatter plot of mean hours watched at each time_since_signup value, with a
# fitted linear trend line (stat_smooth using lm). The plot title previously
# misspelt "signup" and the x-axis label carried a trailing space.
print(
  ggplot(meanhrs_singup, aes(x = time_since_signup, y = hours_watched)) +
    labs(
      title = "Average Daily Hours Watched against time since signup - Regression model",
      x = "Time Since Signup",
      y = "Average Hours"
    ) +
    geom_point() +
    stat_smooth(method = lm)
)

# Pairs plot of time_since_signup against mean hours watched.
print(
  ggpairs(
    meanhrs_singup,
    title = "correlation between Hours watched and time since signup"
  )
)

# Box plot of hours_watched for each gender level, built from the
# unaggregated rows of df.
gender_boxplot <- ggplot(
  data = df,
  mapping = aes(x = gender, y = hours_watched)
) +
  geom_boxplot()

print(gender_boxplot)

# Box plot of hours_watched for each level of group, built from the
# unaggregated rows of df.
grp_boxplot <- ggplot(
  data = df,
  mapping = aes(x = group, y = hours_watched)
) +
  geom_boxplot()

print(grp_boxplot)

Assessment 1

Tyson Aviles s3920384

24/02/2022