
Exploratory data analysis



Housekeeping

Loading needed libraries
library(dplyr)      # data wrangling
library(tidytext)   # tokenization and tidy text mining
library(ggplot2)    # plotting
library(igraph)     # graph objects for the bigram network
library(ggraph)     # network visualization
library(wordcloud2) # interactive word clouds
library(rtweet)     # social media data helpers (not used in this section)


Read YouTube data from a CSV file


youtube <- read.csv("tut1b.youtube.csv", header = T)
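A quick structural check helps confirm the import worked; a minimal sketch, assuming the columns used later in this analysis (Comment, clean_text, cleaner_text, LikeCount, ReplyCount, VideoID):
# Sketch: inspect the imported comments
dplyr::glimpse(youtube)   # column names and types
nrow(youtube)             # number of comments read in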




Analysis of words

# Tokenize the raw Comment column: one row per word
yt1 <- youtube %>%
  unnest_tokens(input = Comment, output = raw)
# Tokenize the pre-cleaned text
yt2 <- youtube %>%
  unnest_tokens(input = clean_text, output = clean)
# Tokenize the fully cleaned text (stop words removed)
yt3 <- youtube %>%
  unnest_tokens(input = cleaner_text, output = cleaner)
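For readers new to unnest_tokens(), it splits each comment into one row per word, lowercasing and stripping punctuation by default; a minimal sketch on a made-up one-row data frame (toy is purely illustrative):
# Sketch: what unnest_tokens() does to a single comment
toy <- data.frame(Comment = "Great speech, REALLY great!")
toy %>%
  unnest_tokens(input = Comment, output = word)
# returns one row per token: "great", "speech", "really", "great"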

# Top words in raw text
yt1 %>% 
  count(raw, sort=TRUE) %>% 
  top_n(25)
## Selecting by n
##      raw     n
## 1    the 10987
## 2     to  6039
## 3    and  5783
## 4      a  5010
## 5     of  4682
## 6    you  4204
## 7      u  4117
## 8     is  4016
## 9     in  3140
## 10  that  3051
## 11     i  2999
## 12 trump  2749
## 13   for  2616
## 14    he  2270
## 15    it  2026
## 16   are  1980
## 17  this  1816
## 18   not  1748
## 19  they  1497
## 20   was  1482
## 21    on  1461
## 22  have  1442
## 23   all  1370
## 24    be  1283
## 25  with  1274
# Top words in clean text
yt2 %>% 
  count(clean, sort=TRUE) %>% 
  top_n(25)
## Selecting by n
##    clean     n
## 1    the 11006
## 2     to  6039
## 3    and  6002
## 4     is  5067
## 5      a  5000
## 6     of  4672
## 7    you  4491
## 8      i  3548
## 9   that  3378
## 10   not  3155
## 11    in  3134
## 12 trump  2803
## 13    it  2616
## 14   for  2615
## 15    he  2523
## 16    at  2494
## 17   are  2291
## 18  this  1808
## 19  they  1604
## 20  have  1554
## 21   was  1520
## 22    on  1451
## 23   all  1373
## 24  what  1348
## 25   out  1295
# Top words in cleaner text
yt3 %>% 
  count(cleaner, sort=TRUE) %>% 
  top_n(25)
## Selecting by n
##        cleaner    n
## 1        trump 2803
## 2       people 1211
## 3    president 1105
## 4     thousand  821
## 5       pelosi  628
## 6    democrats  602
## 7      country  564
## 8       speech  541
## 9      america  527
## 10    laughing  521
## 11       nancy  481
## 12        lies  443
## 13         god  417
## 14        time  389
## 15      twenty  383
## 16        loud  382
## 17    american  365
## 18       obama  308
## 19       world  308
## 20     hundred  306
## 21 republicans  296
## 22         don  284
## 23      excuse  276
## 24    sticking  260
## 25      tongue  259
## 26       white  259
Creating barplots of the top words
# Barplot of the top raw words
yt1 %>% count(raw, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(raw, n), y = n)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  theme_classic() +
  labs(title = "Raw words",
       x = "Words",
       y = "Frequency")
# Barplot of the top cleaned words
yt2 %>% count(clean, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(clean, n), y = n)) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  theme_classic() +
  labs(title = "Cleaned words",
       x = "Words",
       y = "Frequency")
# Barplot of the top words after stop word removal
yt3 %>% count(cleaner, sort = TRUE) %>% top_n(20) %>%
  ggplot(aes(x = reorder(cleaner, n), y = n)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  theme_classic() +
  labs(title = "Stopwords removed",
       x = "Words",
       y = "Frequency")

Create a word cloud of cleaner text
ytwc <- yt3 %>%
  count(cleaner, sort = T)
wordcloud2(ytwc, size=10)
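wordcloud2() takes a data frame whose first two columns are the word and its frequency; with thousands of distinct tokens the widget can get crowded, so one option (a sketch, not part of the original output) is to keep only the more frequent words:
# Sketch: restrict the cloud to words appearing at least 25 times
ytwc %>%
  filter(n >= 25) %>%
  wordcloud2(size = 1)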

Bigram analysis

# Bigram analysis
yt4 <- youtube %>%
  dplyr::select(cleaner_text) %>%
  unnest_tokens(paired_words, cleaner_text, token = "ngrams", n = 2)

# Count bigrams and create a bar chart
yt4 %>%
  count(paired_words, sort = TRUE) %>%
  filter(!is.na(paired_words)) %>%
  top_n(25) %>%
  ggplot(aes(x = reorder(paired_words, n), y = n)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() +
  theme_classic() +
  labs(title = "Most frequent bigrams",
       x = "Bigram",
       y = "Frequency")
## Selecting by n

# Separate bigrams into two words
yt5 <- yt4 %>%
  tidyr::separate(paired_words, c("word1", "word2"), sep = " ") 

# Count the word pairs and drop the first row (the NA pair left by short comments)
yt6 <- yt5 %>%
  count(word1, word2, sort = T) %>%
  filter(!row_number() %in% 1)
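Dropping the first row works here because the NA pair happens to sort to the top; a sketch of a more explicit filter that does not depend on row order (yt6_alt is an illustrative name):
# Sketch: drop missing pairs explicitly instead of by position
yt6_alt <- yt5 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  count(word1, word2, sort = TRUE)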


# Plot the bigram network for pairs that appear at least 50 times
set.seed(1234)   # reproducible graph layout
yt6 %>%
  filter(n >= 50) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_node_point(color = "blue", size = 3, alpha = 0.3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  geom_edge_link0(alpha = 0.6) +
  labs(title = "Bigram network, 50+ appearances") +
  theme(panel.background = element_rect("white"))
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` in the `default_aes` field and elsewhere instead.


User data analysis

# Top commentators

youtube %>%
  count(AuthorDisplayName, sort=TRUE) %>%
  top_n(25)
## Selecting by n
##       AuthorDisplayName   n
##  1         Burt Collins 170
##  2            Live Wire  66
##  3            Nikko3001  58
##  4         Shane Letson  52
##  5  Robert C. Christian  49
##  6           Eva Bailey  48
##  7                 Anke  47
##  8         Albert Green  37
##  9        Augie Rockero  37
## 10          crazy kelly  35
## 11             Neesie B  31
## 12         Chris Garret  30
## 13            Agent Fox  29
## 14         Butler Manny  26
## 15         Karen Sholar  26
## 16           Plague Rat  26
## 17         IG_ Survival  25
## 18 Валерий Заподовников  24
## 19            Auntie H.  24
## 20             Pat Holt  23
## 21             Crown Uk  22
## 22           Dark Horse  22
## 23         david blasko  22
## 24         Jerry Asbury  22
## 25        Scott Chapman  22
Comment length
# Derive character count and word count
youtube$Ncharacters <- nchar(youtube$Comment)
youtube$Nwords <- lengths(strsplit(youtube$Comment, ' '))
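Splitting on a single space counts every run of whitespace as an extra word; a sketch of a slightly more robust count that splits on any whitespace (Nwords_ws is an illustrative name, and its values may differ marginally from Nwords above):
# Sketch: word count that collapses repeated whitespace
youtube$Nwords_ws <- lengths(strsplit(trimws(youtube$Comment), "\\s+"))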

# Descriptive statistics by VideoID
psych::describeBy(youtube, group = youtube$VideoID)
## 
##  Descriptive statistics by group 
## group: zNECVmfJtxc
##                        vars    n    mean      sd median trimmed     mad min
## X.1                       1 9633 4817.00 2780.95 4817.0 4817.00 3570.10   1
## X                         2 9633 4817.00 2780.95 4817.0 4817.00 3570.10   1
## Comment*                  3 9633 4748.58 2731.45 4753.0 4752.74 3515.24   1
## AuthorDisplayName*        4 9633 2307.75 1376.34 2321.0 2297.71 1822.12   1
## AuthorProfileImageUrl*    5 9633 2420.89 1365.81 2434.0 2429.56 1758.36   1
## AuthorChannelUrl*         6 9633 2372.67 1390.82 2350.0 2364.02 1829.53   1
## AuthorChannelID*          7 9633 2372.67 1390.82 2350.0 2364.02 1829.53   1
## ReplyCount                8 9633    0.62    3.44    0.0    0.06    0.00   0
## LikeCount                 9 9633    4.03   25.99    0.0    0.96    0.00   0
## PublishedAt*             10 9633 4602.15 2714.23 4564.0 4586.30 3504.87   1
## UpdatedAt*               11 9633 4599.23 2715.13 4562.0 4583.26 3507.83   1
## CommentID*               12 9633 4817.00 2780.95 4817.0 4817.00 3570.10   1
## ParentID*                13 5039  699.74  406.44  664.0  695.61  501.12   1
## VideoID*                 14 9633    1.00    0.00    1.0    1.00    0.00   1
## row_num                  15 9633 4817.00 2780.95 4817.0 4817.00 3570.10   1
## clean_text*              16 9633 4645.90 2702.40 4656.0 4653.65 3470.77   1
## cleaner_text*            17 9414 4530.30 2599.03 4530.5 4539.67 3354.38   1
## Ncharacters              18 9633  154.05  358.26   80.0   98.95   69.68   1
## Nwords                   19 9633   26.67   58.27   14.0   17.28   11.86   1
##                         max range  skew kurtosis    se
## X.1                    9633  9632  0.00    -1.20 28.33
## X                      9633  9632  0.00    -1.20 28.33
## Comment*               9463  9462 -0.01    -1.20 27.83
## AuthorDisplayName*     4723  4722  0.03    -1.24 14.02
## AuthorProfileImageUrl* 4737  4736 -0.04    -1.20 13.92
## AuthorChannelUrl*      4759  4758  0.03    -1.24 14.17
## AuthorChannelID*       4759  4758  0.03    -1.24 14.17
## ReplyCount              108   108 14.30   290.06  0.04
## LikeCount              1063  1063 24.25   767.82  0.26
## PublishedAt*           9360  9359  0.04    -1.21 27.65
## UpdatedAt*             9358  9357  0.04    -1.22 27.66
## CommentID*             9633  9632  0.00    -1.20 28.33
## ParentID*              1428  1427  0.09    -1.17  5.73
## VideoID*                  1     0   NaN      NaN  0.00
## row_num                9633  9632  0.00    -1.20 28.33
## clean_text*            9295  9294 -0.01    -1.20 27.53
## cleaner_text*          8981  8980 -0.02    -1.21 26.79
## Ncharacters            9574  9573 13.57   255.83  3.65
## Nwords                 1447  1446 12.19   211.54  0.59
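The starred rows above are character columns that psych::describeBy() has coerced to numeric codes, so their means and SDs are not meaningful; a sketch that restricts the summary to the count variables of interest:
# Sketch: describe only the numeric engagement and length variables
psych::describeBy(youtube[, c("LikeCount", "ReplyCount", "Ncharacters", "Nwords")],
                  group = youtube$VideoID)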
# Quantiles for LikeCount and ReplyCount
quantile(youtube$LikeCount, seq(from = 0, to = 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0    0    0    0    0    0    1    1    3    6 1063
quantile(youtube$ReplyCount, seq(from = 0, to = 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0    0    0    0    0    0    0    0    0    1  108
# Histogram of word count by VideoID
youtube %>%
  ggplot(aes(x = Nwords)) +
  geom_histogram() +
  facet_wrap(~VideoID) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Like and reply counts?

# Histogram of LikeCount by VideoID
youtube %>%
  ggplot(aes(x = LikeCount)) +
  geom_histogram() +
  facet_wrap(~VideoID) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of ReplyCount by VideoID
youtube %>%
  ggplot(aes(x = ReplyCount)) +
  geom_histogram() +
  facet_wrap(~VideoID) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatterplot of LikeCount (x) vs. ReplyCount (y) with a fitted regression line
plot(youtube$LikeCount, youtube$ReplyCount)
abline(lm(ReplyCount ~ LikeCount, data = youtube), lwd = 3, col = "blue")

cor.test(youtube$LikeCount, youtube$ReplyCount)
## 
##  Pearson's product-moment correlation
## 
## data:  youtube$LikeCount and youtube$ReplyCount
## t = 86.193, df = 9631, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6484765 0.6710278
## sample estimates:
##       cor 
## 0.6599007
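LikeCount and ReplyCount are heavily right-skewed (skew of roughly 24 and 14 in the descriptives above), so the Pearson estimate is pulled around by a handful of viral comments; a rank-based correlation is a reasonable cross-check (a sketch, output not shown):
# Sketch: Spearman correlation, less sensitive to extreme counts
cor.test(youtube$LikeCount, youtube$ReplyCount, method = "spearman", exact = FALSE)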
# Scatterplot of LikeCount (x) vs. Nwords (y) with a fitted regression line
plot(youtube$LikeCount, youtube$Nwords)
abline(lm(Nwords ~ LikeCount, data = youtube), lwd = 3, col = "blue")

cor.test(youtube$LikeCount, youtube$Nwords)
## 
##  Pearson's product-moment correlation
## 
## data:  youtube$LikeCount and youtube$Nwords
## t = -1.6555, df = 9631, p-value = 0.09786
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.036824126  0.003104437
## sample estimates:
##         cor 
## -0.01686657
# Boxplot of LikeCount by VideoID
ggplot(youtube, aes(x=VideoID, y=LikeCount)) + 
  geom_boxplot() +
  ylim(0,10)
## Warning: Removed 651 rows containing non-finite values (`stat_boxplot()`).
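The warning above appears because ylim() removes every comment with more than 10 likes before the boxplot statistics are computed; a sketch that zooms the axis without dropping observations uses coord_cartesian() instead:
# Sketch: zoom the y-axis without excluding rows from the boxplot statistics
ggplot(youtube, aes(x = VideoID, y = LikeCount)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 10))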

# Boxplot of LikeCount by VideoID (no ylim)
ggplot(youtube, aes(x=VideoID, y=LikeCount)) + 
  geom_boxplot()