library(tidyverse) # all tidyverse packages
## Warning: le package 'tidyverse' a été compilé avec la version R 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.3 v forcats 0.5.1
## Warning: le package 'readr' a été compilé avec la version R 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(purrr)
library(lubridate)
##
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
##
## date, intersect, setdiff, union
library(wordcloud)
## Warning: le package 'wordcloud' a été compilé avec la version R 4.1.3
## Le chargement a nécessité le package : RColorBrewer
## Warning: le package 'RColorBrewer' a été compilé avec la version R 4.1.3
library(RColorBrewer)
library(data.table)
##
## Attachement du package : 'data.table'
## Les objets suivants sont masqués depuis 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## between, first, last
## L'objet suivant est masqué depuis 'package:purrr':
##
## transpose
library(wordcloud)
library(tm)
## Warning: le package 'tm' a été compilé avec la version R 4.1.3
## Le chargement a nécessité le package : NLP
##
## Attachement du package : 'NLP'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## annotate
# Print the raw data frame, then list every column with its type and a preview
# of its first values.
# NOTE(review): `mrbeastdataframe` is not created in the visible part of this
# file -- confirm where it is loaded before this point.
mrbeastdataframe
str(mrbeastdataframe)
## 'data.frame': 1551 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : chr "08wlrlPjWeE" "5rkMxc7yGSk" "7gYpJotNUEs" "aSQUg-h8G4s" ...
## $ title : chr "We Built Houses for Homeless Families" "Feeding a City in Need" "We Gave $3,000,000 of Aid to Ukrainian Refugees!" "We Built Wells in Africa!" ...
## $ publication_date: chr "2022-07-28" "2021-04-05" "2022-06-18" "2022-05-21" ...
## $ description : chr "Download Monster Legends now and go get my monsters! https://monsterlegends.onelink.me/X4Sc/beastphilanthropy\n"| __truncated__ "THIS IS MY FAVORITE VIDEO EVER! Thank you so much for watching this channel and allowing us to grow this charit"| __truncated__ "Go to our sponsor https://betterhelp.com/beastphilanthropy for 10% off your first month of therapy with BetterH"| __truncated__ "Download Dragon City for free, collect all MrBeast Dragons and claim the special rewards: https://dragoncity.on"| __truncated__ ...
## $ channel_id : chr "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" "UCAiLfjNXkNv24uhpzUgPa6A" ...
## $ channel_title : chr "Beast Philanthropy" "Beast Philanthropy" "Beast Philanthropy" "Beast Philanthropy" ...
## $ viewCount : int 9929978 7655799 9312039 24407740 4027048 6312862 15315917 5962100 13432363 4420165 ...
## $ likeCount : int 606477 755297 636795 1389776 302451 771142 869325 364730 638754 281479 ...
## $ favoriteCount : int 0 0 0 0 0 0 0 0 0 0 ...
## $ commentCount : int 14993 37239 53769 43244 9368 37456 38699 7947 26511 11394 ...
## $ url : chr "https://www.youtube.com/watch?v=08wlrlPjWeE" "https://www.youtube.com/watch?v=5rkMxc7yGSk" "https://www.youtube.com/watch?v=7gYpJotNUEs" "https://www.youtube.com/watch?v=aSQUg-h8G4s" ...
# Coerce the raw columns to analysis-friendly types in one pass:
#   * publication_date: character -> Date
#   * the four counters: integer -> double
#   * year: derived from the already-converted publication_date
# NOTE: both lubridate and data.table export year(); data.table is attached
# later in this file, so its version is the one found here.
mrbeastdataframe <- mrbeastdataframe %>%
  mutate(
    publication_date = lubridate::as_date(publication_date),
    viewCount = as.numeric(viewCount),
    commentCount = as.numeric(commentCount),
    likeCount = as.numeric(likeCount),
    favoriteCount = as.numeric(favoriteCount),
    year = year(publication_date)
  )
# Add a month label of the form "<year> - <month number> (<month abbrev>)".
# The abbreviation comes from "%b", so it follows the session's time locale.
#
# The label is built from the `publication_date` column itself rather than
# from `mrbeastdataframe$publication_date`: inside mutate() the bare column
# name always refers to the rows being processed, whereas the `$` form reaches
# back to the global object. A single format string replaces the three
# strftime() calls glued together with paste(); a missing date now stays NA
# instead of becoming the text "NA - NA (NA)".
mrbeastdataframe <- mrbeastdataframe %>%
  mutate(year_month = format(publication_date, "%Y - %m (%b)"))

# Per-column overview (ranges, quartiles, NA counts) of the prepared data.
summary(mrbeastdataframe)
## X id title publication_date
## Min. : 1.0 Length:1551 Length:1551 Min. :2012-02-20
## 1st Qu.: 388.5 Class :character Class :character 1st Qu.:2016-02-12
## Median : 776.0 Mode :character Mode :character Median :2020-01-17
## Mean : 776.0 Mean :2019-04-07
## 3rd Qu.:1163.5 3rd Qu.:2021-12-11
## Max. :1551.0 Max. :2022-12-04
##
## description channel_id channel_title viewCount
## Length:1551 Length:1551 Length:1551 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 344262
## Mode :character Mode :character Mode :character Median : 6550691
## Mean : 21421069
## 3rd Qu.: 27484713
## Max. :311873265
##
## likeCount favoriteCount commentCount url
## Min. : 61 Min. :0 Min. : 0.0 Length:1551
## 1st Qu.: 12702 1st Qu.:0 1st Qu.: 874.2 Class :character
## Median : 199591 Median :0 Median : 4326.0 Mode :character
## Mean : 599293 Mean :0 Mean : 23406.3
## 3rd Qu.: 713609 3rd Qu.:0 3rd Qu.: 25159.2
## Max. :20832736 Max. :0 Max. :1909751.0
## NA's :7 NA's :1
## year year_month
## Min. :2012 Length:1551
## 1st Qu.:2016 Class :character
## Median :2020 Mode :character
## Mean :2019
## 3rd Qu.:2021
## Max. :2022
##
# Total number of views over every video in the data set (all channels).
total_views <- sum(mrbeastdataframe$viewCount)
total_views
## [1] 33224077552
# Same total with a space as thousands separator. `total_views` is a plain
# number, so the former "%" check could never be true and has been dropped;
# `big.mark` is spelled out in full instead of relying on partial matching
# of `big`.
scales::comma(total_views, big.mark = " ")
## [1] "33 224 077 552"
# Main channel ####
# We use filter() to get the data related to the main channel, MrBeast.
# `mrbeast_main_channel` holds the rows whose channel_title is exactly
# "MrBeast"; summary() then gives the per-column overview of that subset.
mrbeast_main_channel <- mrbeastdataframe %>%
  filter(channel_title == "MrBeast")
summary(mrbeast_main_channel)
## X id title publication_date
## Min. : 18.0 Length:731 Length:731 Min. :2012-02-20
## 1st Qu.:200.5 Class :character Class :character 1st Qu.:2015-04-16
## Median :383.0 Mode :character Mode :character Median :2015-12-23
## Mean :383.0 Mean :2016-09-30
## 3rd Qu.:565.5 3rd Qu.:2018-08-10
## Max. :748.0 Max. :2022-12-03
##
## description channel_id channel_title viewCount
## Length:731 Length:731 Length:731 Min. : 23721
## Class :character Class :character Class :character 1st Qu.: 51906
## Mode :character Mode :character Mode :character Median : 268800
## Mean : 26310059
## 3rd Qu.: 36587012
## Max. :311873265
##
## likeCount favoriteCount commentCount url
## Min. : 500 Min. :0 Min. : 81.0 Length:731
## 1st Qu.: 1783 1st Qu.:0 1st Qu.: 291.5 Class :character
## Median : 10665 Median :0 Median : 1251.5 Mode :character
## Mean : 781997 Mean :0 Mean : 31157.3
## 3rd Qu.: 1059317 3rd Qu.:0 3rd Qu.: 44323.8
## Max. :20832736 Max. :0 Max. :732669.0
## NA's :7 NA's :1
## year year_month
## Min. :2012 Length:731
## 1st Qu.:2015 Class :character
## Median :2015 Mode :character
## Mean :2016
## 3rd Qu.:2018
## Max. :2022
##
# Format a number for display, using a space as the thousands separator.
#
# Args:
#   i: a number, or a value that as.numeric() can convert. Elements that
#      contain a literal "%" are passed through unchanged.
#
# Returns:
#   The formatted value(s), e.g. 311873265 becomes "311 873 265".
rewrite <- function(i) {
  # `big.mark` is written out in full: the previous `big = ` only worked
  # through R's partial argument matching.
  ifelse(
    grepl("%", i, fixed = TRUE),
    i,
    scales::comma(as.numeric(i), big.mark = " ")
  )
}
# Sum the views of the main channel, show the total in readable form, then
# express it as a percentage of the views over all channels.
total_views_of_the_main_channel <- sum(mrbeast_main_channel$viewCount)
rewrite(total_views_of_the_main_channel)
## [1] "19 232 652 811"
total_views_of_the_main_channel / total_views * 100
## [1] 57.8877
#### We can see that more than half of the views come from the main channel.
# View count of the single most-watched video on the main channel, shown in
# readable form and as a percentage of the main channel's total views.
the_best <- max(mrbeast_main_channel$viewCount)
rewrite(the_best)
## [1] "311 873 265"
the_best / total_views_of_the_main_channel * 100
## [1] 1.621582
# Total views per publication year on the main channel, formatted for display.
mrbeast_main_channel %>%
  group_by(year) %>%
  summarise(totaal = rewrite(sum(viewCount)))
# Keep only the video titles of the main channel. `titles` and `text` remain
# one-column data.tables, exactly as before.
titles <- mrbeast_main_channel %>%
  select(title) %>%
  as.data.table()
text <- titles

# Build the corpus from the title column itself. VectorSource() treats each
# element of its input as one document; handing it the whole table (which has
# a single element, the column) put every title into one document instead of
# one document per title.
docs <- Corpus(VectorSource(text[["title"]]))

# Strip punctuation, then collapse the runs of whitespace left behind.
docs <- docs %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Lower-case the text so that word counts ignore capitalisation.
# content_transformer() wraps the base function tolower() for use with tm_map().
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove the English stop words supplied by tm's stopwords("english") list.
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Count how often each term occurs: build the term-document matrix, sum every
# term's row, and order the totals from most to least frequent.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- matrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

# Fix the random seed so the drawing is reproducible from run to run.
set.seed(1234)
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 3,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.350,
  colors = brewer.pal(8, "Paired")
)