With roughly 2 billion monthly active users, YouTube is the second most visited site in the world. With around 335 million monthly active users, Twitter is also one of the most used social networking platforms. With the release of the PlayStation 5 and Xbox Series X, I wanted to leverage the interactions on both platforms to understand the reception each console is getting. Among the people I interact with, the PlayStation always seems to get more favorable reviews than the Xbox.

Link to my GitHub for the datasets

Required packages:
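The following packages cover every function used below; note that get_sentiments("nrc") may additionally prompt a one-time lexicon download via the textdata package.

library(tidyverse)    # read_csv, dplyr verbs, ggplot2, fct_reorder
library(tidytext)     # unnest_tokens, stop_words, get_sentiments, tidy, reorder_within
library(lubridate)    # floor_date
library(gridExtra)    # grid.arrange
library(wordcloud)    # wordcloud
library(RColorBrewer) # brewer.pal
library(tm)           # Corpus, VectorSource, DocumentTermMatrix, removeSparseTerms
library(topicmodels)  # LDA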

Data collected from Twitter and YouTube (comments on a video by a popular YouTuber comparing the two consoles):

xbox_tw <- read_csv("xboxTweets.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   text = col_character(),
##   favorited = col_logical(),
##   favoriteCount = col_double(),
##   replyToSN = col_character(),
##   created = col_datetime(format = ""),
##   truncated = col_logical(),
##   replyToSID = col_double(),
##   id = col_double(),
##   replyToUID = col_double(),
##   statusSource = col_character(),
##   screenName = col_character(),
##   retweetCount = col_double(),
##   isRetweet = col_logical(),
##   retweeted = col_logical(),
##   longitude = col_logical(),
##   latitude = col_logical()
## )
ps_tw <- read_csv("ps5Tweets.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   text = col_character(),
##   favorited = col_logical(),
##   favoriteCount = col_double(),
##   replyToSN = col_character(),
##   created = col_datetime(format = ""),
##   truncated = col_logical(),
##   replyToSID = col_double(),
##   id = col_double(),
##   replyToUID = col_double(),
##   statusSource = col_character(),
##   screenName = col_character(),
##   retweetCount = col_double(),
##   isRetweet = col_logical(),
##   retweeted = col_logical(),
##   longitude = col_logical(),
##   latitude = col_logical()
## )
yt <- read_csv("ytcomments.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   videoId = col_character(),
##   textDisplay = col_character(),
##   textOriginal = col_character(),
##   authorDisplayName = col_character(),
##   authorProfileImageUrl = col_character(),
##   authorChannelUrl = col_character(),
##   authorChannelId.value = col_character(),
##   canRate = col_logical(),
##   viewerRating = col_character(),
##   likeCount = col_double(),
##   publishedAt = col_datetime(format = ""),
##   updatedAt = col_datetime(format = ""),
##   id = col_character(),
##   parentId = col_character(),
##   moderationStatus = col_logical()
## )

Exploring the data: both Twitter datasets have 17 variables and 5,000 tweets each, while the YouTube data has 16 variables; however, the YouTube comment set is roughly three times the size (15,468 comments).

head(xbox_tw)
## # A tibble: 6 x 17
##      X1 text  favorited favoriteCount replyToSN created             truncated
##   <dbl> <chr> <lgl>             <dbl> <chr>     <dttm>              <lgl>    
## 1     1 "RT ~ FALSE                 0 <NA>      2020-12-03 05:30:13 FALSE    
## 2     2 "It'~ FALSE                 0 <NA>      2020-12-03 05:30:13 FALSE    
## 3     3 "RT ~ FALSE                 0 <NA>      2020-12-03 05:30:12 FALSE    
## 4     4 "RT ~ FALSE                 0 <NA>      2020-12-03 05:30:09 FALSE    
## 5     5 "RT ~ FALSE                 0 <NA>      2020-12-03 05:30:09 FALSE    
## 6     6 "I g~ FALSE                 0 <NA>      2020-12-03 05:30:07 FALSE    
## # ... with 10 more variables: replyToSID <dbl>, id <dbl>, replyToUID <dbl>,
## #   statusSource <chr>, screenName <chr>, retweetCount <dbl>, isRetweet <lgl>,
## #   retweeted <lgl>, longitude <lgl>, latitude <lgl>
head(ps_tw)
## # A tibble: 6 x 17
##      X1 text  favorited favoriteCount replyToSN created             truncated
##   <dbl> <chr> <lgl>             <dbl> <chr>     <dttm>              <lgl>    
## 1     1 "Whe~ FALSE                 0 <NA>      2020-12-03 05:36:42 FALSE    
## 2     2 "RT ~ FALSE                 0 <NA>      2020-12-03 05:36:41 FALSE    
## 3     3 "RT ~ FALSE                 0 <NA>      2020-12-03 05:36:40 FALSE    
## 4     4 "@AB~ FALSE                 0 ABC       2020-12-03 05:36:37 TRUE     
## 5     5 "RT ~ FALSE                 0 <NA>      2020-12-03 05:36:37 FALSE    
## 6     6 "Jap~ FALSE                 0 <NA>      2020-12-03 05:36:37 FALSE    
## # ... with 10 more variables: replyToSID <dbl>, id <dbl>, replyToUID <dbl>,
## #   statusSource <chr>, screenName <chr>, retweetCount <dbl>, isRetweet <lgl>,
## #   retweeted <lgl>, longitude <lgl>, latitude <lgl>
head(yt)
## # A tibble: 6 x 16
##      X1 videoId textDisplay textOriginal authorDisplayNa~ authorProfileIm~
##   <dbl> <chr>   <chr>       <chr>        <chr>            <chr>           
## 1     1 Jq-ODz~ "Which one~ "Which one ~ Unbox Therapy    https://yt3.ggp~
## 2     2 Jq-ODz~ "Way to mu~ "Way to muc~ Damian Woodward  https://yt3.ggp~
## 3     3 Jq-ODz~ "POV: You ~ "POV: You w~ sjayel518        https://yt3.ggp~
## 4     4 Jq-ODz~ "This is s~ "This is so~ Mojiece          https://yt3.ggp~
## 5     5 Jq-ODz~ "Until the~ "Until the ~ Mike Tran        https://yt3.ggp~
## 6     6 Jq-ODz~ "( No fans~ "( No fans ~ July Amar        https://yt3.ggp~
## # ... with 10 more variables: authorChannelUrl <chr>,
## #   authorChannelId.value <chr>, canRate <lgl>, viewerRating <chr>,
## #   likeCount <dbl>, publishedAt <dttm>, updatedAt <dttm>, id <chr>,
## #   parentId <chr>, moderationStatus <lgl>
dim(xbox_tw)
## [1] 5000   17
dim(ps_tw)
## [1] 5000   17
dim(yt)
## [1] 15468    16
names(xbox_tw)
##  [1] "X1"            "text"          "favorited"     "favoriteCount"
##  [5] "replyToSN"     "created"       "truncated"     "replyToSID"   
##  [9] "id"            "replyToUID"    "statusSource"  "screenName"   
## [13] "retweetCount"  "isRetweet"     "retweeted"     "longitude"    
## [17] "latitude"
names(ps_tw)
##  [1] "X1"            "text"          "favorited"     "favoriteCount"
##  [5] "replyToSN"     "created"       "truncated"     "replyToSID"   
##  [9] "id"            "replyToUID"    "statusSource"  "screenName"   
## [13] "retweetCount"  "isRetweet"     "retweeted"     "longitude"    
## [17] "latitude"
names(yt)
##  [1] "X1"                    "videoId"               "textDisplay"          
##  [4] "textOriginal"          "authorDisplayName"     "authorProfileImageUrl"
##  [7] "authorChannelUrl"      "authorChannelId.value" "canRate"              
## [10] "viewerRating"          "likeCount"             "publishedAt"          
## [13] "updatedAt"             "id"                    "parentId"             
## [16] "moderationStatus"

Five descriptive summaries from the datasets

#Cleaning & viewing top words for Xbox tweets

# Helper: a negated %in% (TRUE when x is NOT in y)
'%!in%' <- function(x, y) !('%in%'(x, y))
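A quick check of the helper with illustrative values:

c("xbox", "rt") %!in% c("rt", "https")
## [1]  TRUE FALSE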

#Top 20 words for Xbox tweets

xbox_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","rangermj","person","tweet")) %>% 
  count(word,sort=TRUE) %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(y=word,x=n))+
  geom_col()+
  labs(title="Top words in Xbox tweets",
       y="",
       x="")+
  theme_light()
## Joining, by = "word"

#Cleaning & viewing top words for PS5 tweets

#Top 20 words for PS5 tweets

ps_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","leanandcuisine","glock","pay5n7yiup","1000","fe0f","aint")) %>% 
  count(word,sort=TRUE) %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(y=word,x=n))+
  geom_col()+
  labs(title="Top words in PS5 tweets",
       y="",
       x="")+
  theme_light()
## Joining, by = "word"

#Cleaning & viewing top words for YouTube comments

#Top 20 words from YouTube comments


yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","2")) %>% 
  count(word,sort=TRUE) %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(y=word,x=n))+
  geom_col()+
  labs(title="Top words in Xbox tweets",
       y="",
       x="")+
  theme_light()
## Joining, by = "word"

Tweets over time for Xbox

xbox_tw %>% 
  mutate(created=floor_date(created, unit="2 minute")) %>% 
  count(created) %>% 
  ggplot(aes(x=created,y=n))+
  geom_line()+
  labs(title="Tweet trend of Xbox tweets on twitter",
       y="Count of tweets",
       x="Mention Time")+
  theme_light()

Tweets over time for PS5

ps_tw %>% 
  mutate(created=floor_date(created, unit="2 minute")) %>% 
  count(created) %>% 
  ggplot(aes(x=created,y=n))+
  geom_line()+
  labs(title="Tweet trend of PS tweets on twitter",
       y="Count of tweets",
       x="Mention Time")+
  theme_light()
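Since both samples were pulled within minutes of each other, the two trends can also be overlaid for a direct comparison. A minimal sketch, reusing the same xbox_tw and ps_tw data frames:

bind_rows(
  xbox_tw %>% mutate(console = "Xbox"),
  ps_tw %>% mutate(console = "PS5")
) %>% 
  mutate(created = floor_date(created, unit = "2 minute")) %>% 
  count(console, created) %>% 
  ggplot(aes(x = created, y = n, colour = console)) +
  geom_line() +
  labs(title = "Tweet trend, Xbox vs PS5",
       y = "Count of tweets",
       x = "Mention Time") +
  theme_light()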

Comparisons

#Sentiment analysis for Xbox & PS5 tweets and YouTube comments

p1 <- xbox_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  count(sentiment) %>% 
  mutate(sentiment=fct_reorder(sentiment,n)) %>% 
  ggplot(aes(x=n,y=sentiment,fill=sentiment))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Sentiments from Xbox tweets",
       y="Sentiments",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p2 <- ps_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  count(sentiment) %>% 
  mutate(sentiment=fct_reorder(sentiment,n)) %>% 
  ggplot(aes(x=n,y=sentiment,fill=sentiment))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Sentiments from PS tweets",
       y="Sentiments",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p3 <- yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  count(sentiment) %>% 
  mutate(sentiment=fct_reorder(sentiment,n)) %>% 
  ggplot(aes(x=n,y=sentiment,fill=sentiment))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Sentiments from Youtube comments",
       y="Sentiments",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
grid.arrange(p1,p2,p3)
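One caveat when reading these panels: the YouTube corpus is about three times the size of each Twitter sample, so raw frequencies aren't directly comparable across sources. A minimal sketch that converts the Xbox counts to proportions (the same change applies to the PS5 and YouTube pipelines):

xbox_tw %>% 
  select(text) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  count(sentiment) %>% 
  mutate(share = n / sum(n))  # share of sentiment-tagged words per category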

What are the top positive words from the Xbox tweets, PS5 tweets & YouTube comments?

p4 <- xbox_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="positive") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top positive words - Xbox",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p5 <- ps_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="positive") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top positive words - PS",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p6 <- yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="positive") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top positive words - Youtube",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
grid.arrange(p4,p5,p6,nrow=1)

Top negative words from the Xbox tweets, PS5 tweets & YouTube comments


p7 <- xbox_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="negative") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top negative words - Xbox",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p8 <- ps_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="negative") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top negative words - PS",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
p9 <- yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(word,sentiment,sort=TRUE) %>%
  filter(sentiment=="negative") %>% 
  mutate(word=fct_reorder(word,n)) %>% 
  slice(1:20) %>% 
  ggplot(aes(x=n,y=word,fill=word))+
  geom_col(show.legend = FALSE)+
  theme_light()+
   labs(title="Top negative words- youtube",
       y="",
       x="Frequency")
## Joining, by = "word"
## Joining, by = "word"
grid.arrange(p7,p8,p9,nrow=1)

Creating word clouds from the Xbox, PS5 & YouTube data

xbox_topwords <- xbox_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  count(word)
## Joining, by = "word"
ps_topwords <- ps_tw %>% 
  select(text,created) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words) %>% 
  count(word)
## Joining, by = "word"
yt_topwords<- yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) %>% 
  count(word)
## Joining, by = "word"
pal <- brewer.pal(8, "Dark2")

Word cloud for Xbox

xbox_topwords %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","rangermj","person")) %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 50, colors=pal))

Word cloud for PS5

ps_topwords %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","leanandcuisine","glock","pay5n7yiup","1000","fe0f","aint")) %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 50, colors=pal))

Word cloud for YouTube

yt_topwords %>% 
  filter(word %!in% c("https","t.co","rt","4","12","500","2")) %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 50, colors=pal))

Topic Modeling in R

xbox_words <- xbox_tw %>% 
  select(text) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words)
## Joining, by = "word"
ps_words <- ps_tw %>% 
  select(text) %>% 
  unnest_tokens(word,text)  %>% 
  anti_join(stop_words)
## Joining, by = "word"
yt_words <- yt %>% 
  select(textOriginal) %>% 
  unnest_tokens(word,textOriginal)  %>% 
  anti_join(stop_words) 
## Joining, by = "word"
myCorpus <- Corpus(VectorSource(xbox_words$word))  
dtm <- DocumentTermMatrix(myCorpus)
dtm_new <- removeSparseTerms(dtm,sparse = 0.999)
rowTotals <- apply(dtm_new , 1, sum) #Find the sum of words in each Document
dtm.new   <- dtm_new[rowTotals> 0, ]  
topic_xbox <- topicmodels::LDA(dtm.new, k = 3, control = list(seed = 1234))
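Note that Corpus(VectorSource(xbox_words$word)) treats every individual token as its own "document". An alternative sketch that instead treats each tweet as a document, using the X1 row index as the document id (assuming X1 uniquely identifies a tweet):

xbox_dtm_tweets <- xbox_tw %>% 
  select(X1, text) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>% 
  count(X1, word) %>% 
  cast_dtm(X1, word, n)  # tidytext: tidy counts -> DocumentTermMatrix

topic_xbox_alt <- topicmodels::LDA(xbox_dtm_tweets, k = 3, control = list(seed = 1234))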


x_topic <- tidy(topic_xbox)

x_top_terms <- x_topic %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


x <- x_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()+
  theme_light()+
  labs(title="Topics from Xbox tweets")


myCorpus_1 <- Corpus(VectorSource(ps_words$word))  
dtm_1 <- DocumentTermMatrix(myCorpus_1)
dtm_new_1 <- removeSparseTerms(dtm_1,sparse = 0.999)
rowTotals <- apply(dtm_new_1 , 1, sum) #Find the sum of words in each Document
dtm.new_1   <- dtm_new_1[rowTotals> 0, ]  
topic_ps <- topicmodels::LDA(dtm.new_1, k = 3, control = list(seed = 1234))


ps_topic <- tidy(topic_ps)

ps_top_terms <- ps_topic %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

ps <- ps_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()+
  theme_light()+
  labs(title="Topics from PS tweets")

myCorpus_2 <- Corpus(VectorSource(yt_words$word))  
dtm_2 <- DocumentTermMatrix(myCorpus_2)
dtm_new_2 <- removeSparseTerms(dtm_2,sparse = 0.999)
rowTotals <- apply(dtm_new_2 , 1, sum) #Find the sum of words in each Document
dtm.new_2   <- dtm_new_2[rowTotals> 0, ]  
topic_yt <- topicmodels::LDA(dtm.new_2, k = 3, control = list(seed = 1234))


yt_topic <- tidy(topic_yt)

yt_top_terms <- yt_topic %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Use a new object name so the yt data frame isn't overwritten
yt_plot <- yt_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()+
  theme_light()+
  labs(title="Topics from YT comments")


grid.arrange(x,ps,yt_plot)

Conclusion: My original intent was to understand how these two products were being received and which one garnered the more positive acclaim. As stated above, I believed the PlayStation would come out the clear favorite, but after this analysis I find I need to expand my datasets to get a clearer picture. I also plan to collect more YouTube comment data, since that platform appears to show the most engagement.