The packages we need

library(tidyverse) # all tidyverse packages
## Warning: le package 'tidyverse' a été compilé avec la version R 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.3     v forcats 0.5.1
## Warning: le package 'readr' a été compilé avec la version R 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(purrr)
library(lubridate)
## 
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     date, intersect, setdiff, union
library(wordcloud)
## Warning: le package 'wordcloud' a été compilé avec la version R 4.1.3
## Le chargement a nécessité le package : RColorBrewer
## Warning: le package 'RColorBrewer' a été compilé avec la version R 4.1.3
library(RColorBrewer)
library(data.table)
## 
## Attachement du package : 'data.table'
## Les objets suivants sont masqués depuis 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## Les objets suivants sont masqués depuis 'package:dplyr':
## 
##     between, first, last
## L'objet suivant est masqué depuis 'package:purrr':
## 
##     transpose
library(wordcloud)
library(tm)
## Warning: le package 'tm' a été compilé avec la version R 4.1.3
## Le chargement a nécessité le package : NLP
## 
## Attachement du package : 'NLP'
## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     annotate

Read the data

View the dataframe created

# Display the full data frame that was just loaded.
print(mrbeastdataframe)

Use the str() function to summarize the data and the type of each column

# Compact overview: number of rows, and the name and type of every column.
mrbeastdataframe %>%
  str()
## 'data.frame':    1551 obs. of  12 variables:
##  $ X               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id              : chr  "08wlrlPjWeE" "5rkMxc7yGSk" "7gYpJotNUEs" "aSQUg-h8G4s" ...
##  $ title           : chr  "We Built Houses for Homeless Families" "Feeding a City in Need" "We Gave $3,000,000 of Aid to Ukrainian Refugees!" "We Built Wells in Africa!" ...
##  $ publication_date: chr  "2022-07-28" "2021-04-05" "2022-06-18" "2022-05-21" ...
##  $ description     : chr  "Download Monster Legends now and go get my monsters! https://monsterlegends.onelink.me/X4Sc/beastphilanthropy\n"| __truncated__ "THIS IS MY FAVORITE VIDEO EVER! Thank you so much for watching this channel and allowing us to grow this charit"| __truncated__ "Go to our sponsor https://betterhelp.com/beastphilanthropy for 10% off your first month of therapy with BetterH"| __truncated__ "Download Dragon City for free, collect all MrBeast Dragons and claim the special rewards: https://dragoncity.on"| __truncated__ ...
##  $ channel_id      : chr  "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" ...
##  $ channel_title   : chr  "Beast Philanthropy" "Beast Philanthropy" "Beast Philanthropy" "Beast Philanthropy" ...
##  $ viewCount       : int  9929978 7655799 9312039 24407740 4027048 6312862 15315917 5962100 13432363 4420165 ...
##  $ likeCount       : int  606477 755297 636795 1389776 302451 771142 869325 364730 638754 281479 ...
##  $ favoriteCount   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ commentCount    : int  14993 37239 53769 43244 9368 37456 38699 7947 26511 11394 ...
##  $ url             : chr  "https://www.youtube.com/watch?v=08wlrlPjWeE" "https://www.youtube.com/watch?v=5rkMxc7yGSk" "https://www.youtube.com/watch?v=7gYpJotNUEs" "https://www.youtube.com/watch?v=aSQUg-h8G4s" ...

str() gives us the columns of the data frame and the type of each variable

We have 1551 observations of 12 variables

Cleaning the data

Change the type of publication_date from character to Date using the lubridate package

# Turn the publication_date strings (e.g. "2022-07-28") into Date values.
mrbeastdataframe <- mutate(
  mrbeastdataframe,
  publication_date = lubridate::as_date(publication_date)
)

Convert the count columns from integer to numeric (double), so that large totals such as the sum of all views do not overflow the integer range

# Store the four engagement counters as doubles instead of integers.
mrbeastdataframe <- mrbeastdataframe %>%
  mutate(
    across(
      c(viewCount, commentCount, likeCount, favoriteCount),
      as.numeric
    )
  )

Create a year column

# Add the calendar year of each video's publication date.
mrbeastdataframe <- mutate(mrbeastdataframe, year = year(publication_date))

Create a year_month column

# Add a "YYYY - MM (Mon)" label for each video, e.g. "2022 - 07 (Jul)"
# (the month abbreviation follows the session locale).
# One strftime() call on the column in scope replaces three calls on
# mrbeastdataframe$publication_date that were pasted together: the expression
# now follows the rows mutate() is working on, and a missing date stays NA
# instead of becoming the text "NA - NA (NA)".
mrbeastdataframe <- mrbeastdataframe %>%
  mutate(year_month = strftime(publication_date, "%Y - %m (%b)"))

Summarize the updated data frame

# Per-column summary statistics of the cleaned data frame.
summary(mrbeastdataframe)
##        X               id               title           publication_date    
##  Min.   :   1.0   Length:1551        Length:1551        Min.   :2012-02-20  
##  1st Qu.: 388.5   Class :character   Class :character   1st Qu.:2016-02-12  
##  Median : 776.0   Mode  :character   Mode  :character   Median :2020-01-17  
##  Mean   : 776.0                                         Mean   :2019-04-07  
##  3rd Qu.:1163.5                                         3rd Qu.:2021-12-11  
##  Max.   :1551.0                                         Max.   :2022-12-04  
##                                                                             
##  description         channel_id        channel_title        viewCount        
##  Length:1551        Length:1551        Length:1551        Min.   :        0  
##  Class :character   Class :character   Class :character   1st Qu.:   344262  
##  Mode  :character   Mode  :character   Mode  :character   Median :  6550691  
##                                                           Mean   : 21421069  
##                                                           3rd Qu.: 27484713  
##                                                           Max.   :311873265  
##                                                                              
##    likeCount        favoriteCount  commentCount           url           
##  Min.   :      61   Min.   :0     Min.   :      0.0   Length:1551       
##  1st Qu.:   12702   1st Qu.:0     1st Qu.:    874.2   Class :character  
##  Median :  199591   Median :0     Median :   4326.0   Mode  :character  
##  Mean   :  599293   Mean   :0     Mean   :  23406.3                     
##  3rd Qu.:  713609   3rd Qu.:0     3rd Qu.:  25159.2                     
##  Max.   :20832736   Max.   :0     Max.   :1909751.0                     
##  NA's   :7                        NA's   :1                             
##       year       year_month       
##  Min.   :2012   Length:1551       
##  1st Qu.:2016   Class :character  
##  Median :2020   Mode  :character  
##  Mean   :2019                     
##  3rd Qu.:2021                     
##  Max.   :2022                     
## 

STARTING THE ANALYSIS PHASE

The total views for all MrBeast channels

# Sum of the view counts over every video of every channel in the data.
total_views <- sum(mrbeastdataframe[["viewCount"]])

total views

# Show the raw total first.
total_views
## [1] 33224077552
# Then show it with a space between each group of three digits. total_views is
# a single number, so no "%" check is needed, and big.mark is spelled out in
# full rather than relying on partial argument matching.
scales::comma(total_views, big.mark = " ")
## [1] "33 224 077 552"

# Main channel
#### We use filter() to get the data related to the main channel, MrBeast

# Keep only the rows whose channel_title is exactly "MrBeast".
mrbeast_main_channel <- filter(mrbeastdataframe, channel_title == "MrBeast")

Summarize the data

# Per-column summary statistics for the main channel only.
mrbeast_main_channel %>%
  summary()
##        X              id               title           publication_date    
##  Min.   : 18.0   Length:731         Length:731         Min.   :2012-02-20  
##  1st Qu.:200.5   Class :character   Class :character   1st Qu.:2015-04-16  
##  Median :383.0   Mode  :character   Mode  :character   Median :2015-12-23  
##  Mean   :383.0                                         Mean   :2016-09-30  
##  3rd Qu.:565.5                                         3rd Qu.:2018-08-10  
##  Max.   :748.0                                         Max.   :2022-12-03  
##                                                                            
##  description         channel_id        channel_title        viewCount        
##  Length:731         Length:731         Length:731         Min.   :    23721  
##  Class :character   Class :character   Class :character   1st Qu.:    51906  
##  Mode  :character   Mode  :character   Mode  :character   Median :   268800  
##                                                           Mean   : 26310059  
##                                                           3rd Qu.: 36587012  
##                                                           Max.   :311873265  
##                                                                              
##    likeCount        favoriteCount  commentCount          url           
##  Min.   :     500   Min.   :0     Min.   :    81.0   Length:731        
##  1st Qu.:    1783   1st Qu.:0     1st Qu.:   291.5   Class :character  
##  Median :   10665   Median :0     Median :  1251.5   Mode  :character  
##  Mean   :  781997   Mean   :0     Mean   : 31157.3                     
##  3rd Qu.: 1059317   3rd Qu.:0     3rd Qu.: 44323.8                     
##  Max.   :20832736   Max.   :0     Max.   :732669.0                     
##  NA's   :7                        NA's   :1                            
##       year       year_month       
##  Min.   :2012   Length:731        
##  1st Qu.:2015   Class :character  
##  Median :2015   Mode  :character  
##  Mean   :2016                     
##  3rd Qu.:2018                     
##  Max.   :2022                     
## 

total views of the main channel

Create a function, rewrite(), that formats a number with a space between each group of three digits

# Format numbers with a space between each group of three digits.
#
# i: a vector of numbers, or of strings holding numbers. Elements that contain
#    a "%" sign are returned unchanged.
# Returns a character vector the same length as i.
rewrite <- function(i) {
  out <- as.character(i)
  needs_format <- !grepl("%", out, fixed = TRUE)
  # Convert only the non-percent elements: running as.numeric() over the whole
  # vector would raise coercion warnings for the "%" strings.
  if (any(needs_format)) {
    # big.mark is spelled out in full rather than partially matched as `big`.
    out[needs_format] <- scales::comma(
      as.numeric(i[needs_format]),
      big.mark = " "
    )
  }
  out
}
# Add up the view counts of every main-channel video, then show the total
# formatted by rewrite().
total_views_of_the_main_channel <- sum(mrbeast_main_channel$viewCount)
rewrite(total_views_of_the_main_channel)
## [1] "19 232 652 811"

Percentage of the total views that come from the main channel

# Main-channel views as a percentage of the views across all channels.
(total_views_of_the_main_channel / total_views) * 100
## [1] 57.8877

#### We can see that more than half of the views come from the main channel

# Highest view count reached by any single main-channel video.
the_best <- max(mrbeast_main_channel$viewCount)
rewrite(the_best)
## [1] "311 873 265"
# That video's views as a percentage of the main channel's total views.
(the_best / total_views_of_the_main_channel) * 100
## [1] 1.621582

views grouped by year

# Total main-channel views per publication year, formatted by rewrite().
summarise(
  group_by(mrbeast_main_channel, year),
  totaal = rewrite(sum(viewCount))
)

See the most used words in the titles:

# Collect the main-channel video titles for the word-frequency analysis.
titles <- mrbeast_main_channel %>%
  select(title) %>%
  as.data.table()

# VectorSource() treats each element of its input as one document. The table
# has a single column, so passing the table itself would turn the whole column
# into one document. Pass the title vector instead: one document per title.
text <- titles[["title"]]
docs <- Corpus(VectorSource(text))

# Remove punctuation, then collapse repeated whitespace.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Lower-case every document so that word counts ignore capitalisation.
docs <- docs %>%
  tm_map(content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Drop English stop words from the corpus.
docs <- docs %>%
  tm_map(removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Count how often each term occurs in the corpus, most frequent first.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- matrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

# Fix the random seed so the cloud is reproducible between runs.
set.seed(1234)

# Draw up to 100 terms that occur at least 3 times.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 3,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.350,
  colors = brewer.pal(8, "Paired")
)