library(dplyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)
library(wordcloud2)
library(rtweet)
youtube <- read.csv("tut1b.youtube.csv", header = T)
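The CSV already ships with preprocessed clean_text and cleaner_text columns alongside the raw Comment column. For reference, a minimal sketch of how such columns could be built, assuming stringr and tidytext's stop_words lexicon (this is not the tutorial's actual preprocessing; youtube_prep, clean_text2, and cleaner_text2 are hypothetical names):
library(stringr)
youtube_prep <- youtube %>%
  mutate(
    # lowercase, strip URLs, keep only letters and apostrophes
    clean_text2 = Comment %>%
      str_to_lower() %>%
      str_replace_all("http\\S+|www\\.\\S+", " ") %>%
      str_replace_all("[^a-z' ]", " "),
    # drop standard English stopwords, word by word
    cleaner_text2 = sapply(
      str_split(clean_text2, "\\s+"),
      function(w) paste(w[!w %in% tidytext::stop_words$word], collapse = " ")
    )
  )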
# Data preparation for raw text
yt1 <- youtube %>%
  unnest_tokens(input = Comment, output = raw)
# Data preparation for clean text
yt2 <- youtube %>%
  unnest_tokens(input = clean_text, output = clean)
# Data preparation for cleaner text
yt3 <- youtube %>%
  unnest_tokens(input = cleaner_text, output = cleaner)
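Note that unnest_tokens lowercases text and strips punctuation by default, so even the "raw" token counts below are already case-folded.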
# Top words in raw text
yt1 %>%
  count(raw, sort = TRUE) %>%
  top_n(25)
## Selecting by n
## raw n
## 1 the 10987
## 2 to 6039
## 3 and 5783
## 4 a 5010
## 5 of 4682
## 6 you 4204
## 7 u 4117
## 8 is 4016
## 9 in 3140
## 10 that 3051
## 11 i 2999
## 12 trump 2749
## 13 for 2616
## 14 he 2270
## 15 it 2026
## 16 are 1980
## 17 this 1816
## 18 not 1748
## 19 they 1497
## 20 was 1482
## 21 on 1461
## 22 have 1442
## 23 all 1370
## 24 be 1283
## 25 with 1274
# Top words in clean text
yt2 %>%
  count(clean, sort = TRUE) %>%
  top_n(25)
## Selecting by n
## clean n
## 1 the 11006
## 2 to 6039
## 3 and 6002
## 4 is 5067
## 5 a 5000
## 6 of 4672
## 7 you 4491
## 8 i 3548
## 9 that 3378
## 10 not 3155
## 11 in 3134
## 12 trump 2803
## 13 it 2616
## 14 for 2615
## 15 he 2523
## 16 at 2494
## 17 are 2291
## 18 this 1808
## 19 they 1604
## 20 have 1554
## 21 was 1520
## 22 on 1451
## 23 all 1373
## 24 what 1348
## 25 out 1295
# Top words in cleaner text
yt3 %>%
  count(cleaner, sort = TRUE) %>%
  top_n(25)
## Selecting by n
## cleaner n
## 1 trump 2803
## 2 people 1211
## 3 president 1105
## 4 thousand 821
## 5 pelosi 628
## 6 democrats 602
## 7 country 564
## 8 speech 541
## 9 america 527
## 10 laughing 521
## 11 nancy 481
## 12 lies 443
## 13 god 417
## 14 time 389
## 15 twenty 383
## 16 loud 382
## 17 american 365
## 18 obama 308
## 19 world 308
## 20 hundred 306
## 21 republicans 296
## 22 don 284
## 23 excuse 276
## 24 sticking 260
## 25 tongue 259
## 26 white 259
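The token don at rank 22 is a stub left over from contractions such as "don't" once punctuation was stripped. Such fragments can be dropped with a small custom stopword list; a sketch where custom_stops is hypothetical ("don" comes from the output above, the other entries are illustrative):
custom_stops <- data.frame(cleaner = c("don", "didn", "doesn", "isn"))  # hypothetical fragment list
yt3 %>%
  anti_join(custom_stops, by = "cleaner") %>%
  count(cleaner, sort = TRUE) %>%
  top_n(25)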
# Creating a barplot for top raw words
yt1 %>% count(raw, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(raw, n), y = n)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  theme_classic() +
  labs(title = "Raw words",
       x = "Words",
       y = "Frequency")
# Creating a barplot for top clean words
yt2 %>% count(clean, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(clean, n), y = n)) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  theme_classic() +
  labs(title = "Cleaned words",
       x = "Words",
       y = "Frequency")
# Creating a barplot for top cleaner words
yt3 %>% count(cleaner, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(cleaner, n), y = n)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  theme_classic() +
  labs(title = "Stopwords removed",
       x = "Words",
       y = "Frequency")
ytwc <- yt3 %>%
  count(cleaner, sort = T)
wordcloud2(ytwc, size = 10)
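A caution on wordcloud2: large size values can scale the most frequent words beyond the canvas so they silently fail to render. If the cloud appears sparse or empty, a value near the default of 1 is worth trying, e.g. wordcloud2(ytwc, size = 0.7).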
# Bigram analysis
yt4 <- youtube %>%
  dplyr::select(cleaner_text) %>%
  unnest_tokens(paired_words, cleaner_text, token = "ngrams", n = 2)
# Count bigrams and create a bar chart
yt4 %>%
  count(paired_words, sort = T) %>%
  filter(!is.na(paired_words)) %>%
  top_n(25) %>%
  ggplot(aes(x = reorder(paired_words, n), y = n)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() +
  theme_classic() +
  labs(title = "Most frequent bigrams",
       x = "Bigram",
       y = "Frequency")
## Selecting by n
# Separate bigrams into two words
yt5 <- yt4 %>%
  tidyr::separate(paired_words, c("word1", "word2"), sep = " ")
# Count bigram word pairs (dropping the first row, which is presumably
# the NA pair produced by empty comments)
yt6 <- yt5 %>%
  count(word1, word2, sort = T) %>%
  filter(!row_number() %in% 1)
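Before discarding the first row, it is worth confirming it really is the NA pair; a quick check:
yt5 %>%
  count(word1, word2, sort = T) %>%
  head(3)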
set.seed(1234)
yt6 %>%
  filter(n >= 50) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_node_point(color = "blue", size = 3, alpha = 0.3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  geom_edge_link0(alpha = 0.6) +
  labs(title = "Bigram network, 50+ appearances") +
  theme(panel.background = element_rect("white"))
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` in the `default_aes` field and elsewhere instead.
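This deprecation warning comes from ggraph's own geom defaults rather than from the code above; the plot renders correctly and the warning can be ignored.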
# Top commentators
youtube %>%
  count(AuthorDisplayName, sort = TRUE) %>%
  top_n(25)
## Selecting by n
##       AuthorDisplayName   n
## 1          Burt Collins  170
## 2             Live Wire   66
## 3             Nikko3001   58
## 4          Shane Letson   52
## 5   Robert C. Christian   49
## 6            Eva Bailey   48
## 7                  Anke   47
## 8          Albert Green   37
## 9         Augie Rockero   37
## 10          crazy kelly   35
## 11             Neesie B   31
## 12         Chris Garret   30
## 13            Agent Fox   29
## 14         Butler Manny   26
## 15         Karen Sholar   26
## 16           Plague Rat   26
## 17         IG_ Survival   25
## 18 Валерий Заподовников   24
## 19            Auntie H.   24
## 20             Pat Holt   23
## 21             Crown Uk   22
## 22           Dark Horse   22
## 23         david blasko   22
## 24         Jerry Asbury   22
## 25        Scott Chapman   22
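The same bar-chart pattern used for words above works for commenters as well; a sketch (the fill color is arbitrary):
youtube %>%
  count(AuthorDisplayName, sort = TRUE) %>%
  top_n(20) %>%
  ggplot(aes(x = reorder(AuthorDisplayName, n), y = n)) +
  geom_bar(stat = "identity", fill = "purple") +
  coord_flip() +
  theme_classic() +
  labs(title = "Most active commenters",
       x = "Author",
       y = "Comments")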
# Histogram of LikeCount by VideoID
youtube %>%
  ggplot(aes(x = LikeCount)) +
  geom_histogram() +
  facet_wrap(~VideoID) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of ReplyCount by VideoID
youtube %>%
  ggplot(aes(x = ReplyCount)) +
  geom_histogram() +
  facet_wrap(~VideoID) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot of LikeCount vs. ReplyCount
plot(youtube$LikeCount, youtube$ReplyCount)
abline(lm(ReplyCount ~ LikeCount, data = youtube), lwd = 3, col = "blue")
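For consistency with the other figures, the same scatterplot can be drawn in ggplot2; geom_smooth with method = "lm" refits the same regression internally:
ggplot(youtube, aes(x = LikeCount, y = ReplyCount)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal()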
cor.test(youtube$LikeCount, youtube$ReplyCount)
##
## Pearson's product-moment correlation
##
## data: youtube$LikeCount and youtube$ReplyCount
## t = 86.193, df = 9631, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6484765 0.6710278
## sample estimates:
## cor
## 0.6599007
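Like and reply counts are heavily right-skewed, so Pearson's r can be driven by a handful of viral comments. A rank-based correlation is a useful robustness check (on count data with ties, R will warn that the p-value is approximate):
cor.test(youtube$LikeCount, youtube$ReplyCount, method = "spearman")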
# Scatterplot of LikeCount vs. Nwords
plot(youtube$LikeCount, youtube$Nwords)
abline(lm(Nwords ~ LikeCount, data = youtube), lwd = 3, col = "blue")
cor.test(youtube$LikeCount, youtube$Nwords)
##
## Pearson's product-moment correlation
##
## data: youtube$LikeCount and youtube$Nwords
## t = -1.6555, df = 9631, p-value = 0.09786
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.036824126 0.003104437
## sample estimates:
## cor
## -0.01686657
# Boxplot of LikeCount by VideoID
ggplot(youtube, aes(x = VideoID, y = LikeCount)) +
  geom_boxplot() +
  ylim(0, 10)
## Warning: Removed 651 rows containing non-finite values (`stat_boxplot()`).
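The warning above is ylim() at work: the 651 comments with more than 10 likes are removed before the boxplot statistics are computed, which shifts the medians and whiskers. To zoom in without discarding data, coord_cartesian crops after the statistics are calculated:
ggplot(youtube, aes(x = VideoID, y = LikeCount)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 10))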
# Boxplot of LikeCount by VideoID (no ylim)
ggplot(youtube, aes(x = VideoID, y = LikeCount)) +
  geom_boxplot()
Comment length