Post by: Edward Perez “Many YouTubers (influencers) can make any brand famous or destroy it. Here I found a dataset that can help us see what they are saying.

https://www.kaggle.com/praneshmukhopadhyay/youtubers-saying-things

Some possible analyses are

How big their audience is, how many views they get, and who is viewed most often.”


1. Import libraries and data

library(tidyverse)
library(readr)
library(curl)
library(ggplot2)
library(dplyr)
library(lubridate)
library(stringr)
library(scales)
library(httr)
# Load the YouTube dataset from the raw CSV hosted on GitHub.
# Offline alternative (local copy of the same file):
# youtube <- read.csv("youtube.csv")
youtube <- "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week6/Project2B/youtube.csv" %>%
  curl() %>%
  read.csv()

2. Tidy data

# Keep only the leading token of the Subscribers and Views columns (the part
# before the first run of spaces), storing it as Num_Subscribers / Num_Views.
# Only one output column is named, so every later piece is surplus;
# extra = "drop" discards it on purpose instead of relying on the default
# extra = "warn" behaviour, which drops the same pieces but warns about it.
youtube2 <- youtube %>%
  separate(Subscribers, into = c("Num_Subscribers"), sep = " +", extra = "drop") %>%
  separate(Views, into = c("Num_Views"), sep = " +", extra = "drop")

# Convert abbreviated counts such as "986K", "10.9M" or "1.2B" to plain numbers.
#
# x: character vector of counts; each element is expected to start with a
#    number, optionally followed directly by a K, M or B suffix.
# Returns a numeric vector the same length as x. Values without a recognised
# suffix are returned unscaled; text with no leading number becomes NA.
parse_abbrev_count <- function(x) {
  x <- as.character(x)
  magnitude <- as.numeric(sub("^(\\d+\\.?\\d*).*$", "\\1", x))
  # Capture only the letter that immediately follows the number, so a stray
  # "K"/"M" elsewhere in the text cannot change the scale.
  suffix <- sub("^\\d+\\.?\\d*([KMB]?).*$", "\\1", x)
  scale <- c(K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(scale[suffix])
  # No (or an unrecognised) suffix: the number is already in units.
  multiplier[is.na(multiplier)] <- 1
  magnitude * multiplier
}

# Apply the same conversion to both count columns.
youtube2$Num_Subscribers <- parse_abbrev_count(youtube2$Num_Subscribers)
youtube2$Num_Views <- parse_abbrev_count(youtube2$Num_Views)

# Blank "Released" values carry no age information; label them explicitly.
youtube2$Released[youtube2$Released == ""] <- "Unknown"

# Convert the relative "Released" text ("2 years ago", "1 month ago",
# "3 weeks ago", ...) into an approximate release year. The result is stored
# as character so rows without a usable age can carry the "UNKNOWN" sentinel.
today <- Sys.Date()

# Pull the number and the unit out separately. str_extract() returns NA when
# nothing matches, so text such as "Unknown" no longer triggers coercion
# warnings. Matching the singular stem also covers "1 year ago"/"1 month ago".
released_n <- as.numeric(str_extract(youtube2$Released, "\\d+\\.?\\d*"))
released_unit <- str_extract(youtube2$Released, "year|month|week|day")

# Express year/month ages in whole months and week/day ages in days.
months_back <- case_when(
  released_unit == "year" ~ 12 * released_n,
  released_unit == "month" ~ released_n
)
days_back <- case_when(
  released_unit == "week" ~ 7 * released_n,
  released_unit == "day" ~ released_n
)

# %m-% rolls back to the last valid day of the month instead of producing NA
# (e.g. 31 March minus one month). Missing offsets are replaced by 0 for the
# arithmetic and masked again afterwards.
year_from_months <- year(today %m-% months(as.integer(round(coalesce(months_back, 0)))))
year_from_months[is.na(months_back)] <- NA
year_from_days <- year(today - round(days_back))

release_year <- coalesce(year_from_months, year_from_days)
youtube2$ReleaseYr <- ifelse(is.na(release_year), "UNKNOWN", as.character(release_year))
## Warning in .period_from_units(list(...)): NAs introduced by coercion
## Warning in months(as.numeric(sub("^(\\d+\\.?\\d*).*$", "\\1",
## youtube2$Released))): NAs introduced by coercion

3. Data analysis

# Per-channel summary: summed subscribers, summed views and the ratio of the
# two (views per subscriber), ordered from the largest subscriber total down.
# NOTE(review): Num_Subscribers looks like a channel-level figure repeated on
# every video row, so its sum grows with the number of rows per channel --
# confirm that this is the intended measure.
youtube2 %>%
  group_by(Channel) %>%
  summarise(
    total_subscribers = sum(Num_Subscribers),
    total_views = sum(Num_Views)
  ) %>%
  mutate(avg_views_per_subscriber = total_views / total_subscribers) %>%
  arrange(desc(total_subscribers))
## # A tibble: 91 × 4
##    Channel                    total_subscribers total_views avg_views_per_subsc…
##    <chr>                                  <dbl>       <dbl>                <dbl>
##  1 MrBeast                            986700000  1313000000                1.33 
##  2 Markiplier                         382800000   885000000                2.31 
##  3 jacksepticeye                      364000000   301000000                0.827
##  4 BuzzFeedVideo                      243600000   397000000                1.63 
##  5 Kurzgesagt – In a Nutshell         230100000   304000000                1.32 
##  6 Gordon Ramsay                      224400000   433000000                1.93 
##  7 The Infographics Show              192100000   176800000                0.920
##  8 VICE                               183600000   303000000                1.65 
##  9 Vsauce                             174000000   230000000                1.32 
## 10 penguinz0                          149850000   161700000                1.08 
## # … with 81 more rows
# Same per-channel summary (summed subscribers, summed views, views per
# subscriber), this time ordered by views per subscriber, highest first.
youtube2 %>%
  group_by(Channel) %>%
  summarise(
    total_subscribers = sum(Num_Subscribers),
    total_views = sum(Num_Views)
  ) %>%
  mutate(avg_views_per_subscriber = total_views / total_subscribers) %>%
  arrange(desc(avg_views_per_subscriber))
## # A tibble: 91 × 4
##    Channel                   total_subscribers total_views avg_views_per_subscr…
##    <chr>                                 <dbl>       <dbl>                 <dbl>
##  1 Parks and Recreation                1974000    23200000                 11.8 
##  2 Insider News                        2808000    24660000                  8.78
##  3 The Stig                            2460000    18197000                  7.40
##  4 Hell's Kitchen                     13500000    73400000                  5.44
##  5 JCS - Criminal Psychology          14370000    75400000                  5.25
##  6 Key & Peele                        46930000   243000000                  5.18
##  7 NowThis News                       20760000   106200000                  5.12
##  8 Epicurious                         48120000   203700000                  4.23
##  9 The Office                         43520000   181100000                  4.16
## 10 Brooklyn Nine-Nine                 10164000    41800000                  4.11
## # … with 81 more rows
# Plot the ten channels with the largest summed subscriber counts as a
# horizontal bar chart, with the y axis shown in millions.
youtube2 %>%
  group_by(Channel) %>%
  summarise(total_subscribers = sum(Num_Subscribers)) %>%
  arrange(desc(total_subscribers)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(Channel, total_subscribers), y = total_subscribers)) +
  # The data is already one row per channel, so draw the values directly
  # rather than re-aggregating them with stat = "summary".
  geom_col() +
  # Title and axis label name the plotted variable (subscribers, not views).
  ggtitle("Top 10 Youtube Channels by Total Subscribers") +
  xlab("Channel") +
  ylab("Total Subscribers") +
  scale_y_continuous(labels = label_number(suffix = " M", scale = 1e-6)) +
  coord_flip()

# Scatter plot with one point per channel: summed subscribers on the x axis
# (labelled in thousands) against views per subscriber on the y axis.
youtube2 %>%
  group_by(Channel) %>%
  summarise(
    total_subscribers = sum(Num_Subscribers),
    total_views = sum(Num_Views)
  ) %>%
  mutate(avg_views_per_subscriber = total_views / total_subscribers) %>%
  ggplot(aes(x = total_subscribers, y = avg_views_per_subscriber)) +
  geom_point() +
  scale_x_continuous(labels = label_number(scale = 1e-3, suffix = " K"))

# Horizontal bar chart of the number of rows per Category, with the bars
# ordered by that count.
youtube2 %>%
  count(Category, name = "count") %>%
  ggplot(aes(x = reorder(Category, (count)), y = count)) +
  geom_bar(stat = "identity") +
  coord_flip()

4. Conclusion

Based on the data, the MrBeast channel has the highest number of subscribers as well as the most views since 2009 of any channel, with roughly 987M subscribers (summed across its videos) and 1.3B views. However, a high subscriber total does not necessarily mean a channel is the most popular. Looking at the scatter plot, there is no clear relationship between total subscribers and average views per subscriber. In fact, the MrBeast channel is not even in the top 60 by average views per subscriber, averaging a little over 1 view per subscriber (1.33). The channel with the most views per subscriber is Parks and Recreation, with 11.8 views per subscriber.