TextAnalysis

A Text Analysis of The Johnny Depp and Amber Heard Trial, Through Youtube Comments

For this project, I got CSVs of the YouTube comments for each day of the trial. I hypothesize that the trial will become more popular each day. I also think that the sentiment of the comments will become more positive over time, since people grew to love Depp during the trial.

First, I am going to load all of the necessary libraries I will be using for this project.

library(tidyverse)
library(tidytext)
library(textdata)
library(devtools)
# Install wordcloud2 from GitHub only when it is not already available, so the
# report does not reinstall the package every time it is run.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install_github("lchiffon/wordcloud2")
}

Next, I will import the data from the csv files I found on github.

# Load the per-day YouTube comment exports into one data frame per trial day.
# No file is read for days 9, 11, 13 and 14, and day 18 is read from
# "day18 2.csv".
library(readr)

# View() opens the data viewer and is only useful in an interactive session.
# In the knitted report every View() call was followed by a
# system2("/usr/bin/otool") warning, so the calls are guarded with
# interactive().

day1 <- read_csv("~/Desktop/texxtanalysis project/day1.csv")

## Warning: One or more parsing issues, see `problems()` for details

if (interactive()) View(day1)

day2 <- read_csv("~/Desktop/texxtanalysis project/day2.csv")

## Warning: One or more parsing issues, see `problems()` for details

if (interactive()) View(day2)

day3 <- read_csv("~/Desktop/texxtanalysis project/day3.csv")

## Warning: One or more parsing issues, see `problems()` for details

if (interactive()) View(day3)

day4 <- read_csv("~/Desktop/texxtanalysis project/day4.csv")

## Warning: One or more parsing issues, see `problems()` for details

if (interactive()) View(day4)

# NOTE(review): days 1-4 report parsing issues; inspect them with
# problems(day1) etc. before trusting those rows.

day5 <- read_csv("~/Desktop/texxtanalysis project/day5.csv")
if (interactive()) View(day5)

day6 <- read_csv("~/Desktop/texxtanalysis project/day6.csv")
if (interactive()) View(day6)

day7 <- read_csv("~/Desktop/texxtanalysis project/day7.csv")
if (interactive()) View(day7)

day8 <- read_csv("~/Desktop/texxtanalysis project/day8.csv")
if (interactive()) View(day8)

day10 <- read_csv("~/Desktop/texxtanalysis project/day10.csv")
if (interactive()) View(day10)

day12 <- read_csv("~/Desktop/texxtanalysis project/day12.csv")
if (interactive()) View(day12)

day15 <- read_csv("~/Desktop/texxtanalysis project/day15.csv")
if (interactive()) View(day15)

day16 <- read_csv("~/Desktop/texxtanalysis project/day16.csv")
if (interactive()) View(day16)

day17 <- read_csv("~/Desktop/texxtanalysis project/day17.csv")
if (interactive()) View(day17)

day18 <- read_csv("~/Desktop/texxtanalysis project/day18 2.csv")
if (interactive()) View(day18)

day19 <- read_csv("~/Desktop/texxtanalysis project/day19.csv")
if (interactive()) View(day19)

day20 <- read_csv("~/Desktop/texxtanalysis project/day20.csv")
if (interactive()) View(day20)

day21 <- read_csv("~/Desktop/texxtanalysis project/day21.csv")
if (interactive()) View(day21)

day22 <- read_csv("~/Desktop/texxtanalysis project/day22.csv")
if (interactive()) View(day22)

day23 <- read_csv("~/Desktop/texxtanalysis project/day23.csv")
if (interactive()) View(day23)

day24 <- read_csv("~/Desktop/texxtanalysis project/day24.csv")
if (interactive()) View(day24)

Now, I am going to use unnest_tokens to break each comment up into individual words, so that I am able to do an analysis.

# Tokenise the `messages` column of every day's comments into one row per
# word, stored in a `word` column.
day1_words <- unnest_tokens(day1, output = word, input = messages)
day2_words <- unnest_tokens(day2, output = word, input = messages)
day3_words <- unnest_tokens(day3, output = word, input = messages)
day4_words <- unnest_tokens(day4, output = word, input = messages)
day5_words <- unnest_tokens(day5, output = word, input = messages)
day6_words <- unnest_tokens(day6, output = word, input = messages)
day7_words <- unnest_tokens(day7, output = word, input = messages)
day8_words <- unnest_tokens(day8, output = word, input = messages)
day10_words <- unnest_tokens(day10, output = word, input = messages)
day12_words <- unnest_tokens(day12, output = word, input = messages)
day15_words <- unnest_tokens(day15, output = word, input = messages)
day16_words <- unnest_tokens(day16, output = word, input = messages)
day17_words <- unnest_tokens(day17, output = word, input = messages)
day18_words <- unnest_tokens(day18, output = word, input = messages)
day19_words <- unnest_tokens(day19, output = word, input = messages)
day20_words <- unnest_tokens(day20, output = word, input = messages)
day21_words <- unnest_tokens(day21, output = word, input = messages)
day22_words <- unnest_tokens(day22, output = word, input = messages)
day23_words <- unnest_tokens(day23, output = word, input = messages)
day24_words <- unnest_tokens(day24, output = word, input = messages)

Let’s find out how many words were commented on day 1 of the trial compared to 24. This will give us insight on if the trial became more popular, which I will analyze soon.

# Total number of word tokens on the first and the last day; each result is a
# one-row tibble with a single column `n`.
day1count <- count(day1_words)
day24count <- count(day24_words)

day1count

## # A tibble: 1 × 1
##        n
##    <int>
## 1 339109

day24count

## # A tibble: 1 × 1
##         n
##     <int>
## 1 1318056

Day 1 had 339,109 words written in the comments, and Day 24 had 1,318,056 words in the comments! That is about 3.89x as many words. This shows that from the first day of the trial to the last, the volume of YouTube comments increased, most likely meaning that the number of viewers grew over time. Either way, this shows there was more engagement with the trial.

Now, I am going to create variables for each day so that I am able later to graph the word count, and allow us to give a better visualization on the popularity of the trial over time, rather than just on Day 1 and Day 24.

# Total number of word tokens for every remaining day (one-row tibbles with a
# single column `n`), mirroring day1count and day24count.
day2count <- count(day2_words)
day3count <- count(day3_words)
day4count <- count(day4_words)
day5count <- count(day5_words)
day6count <- count(day6_words)
day7count <- count(day7_words)
day8count <- count(day8_words)
day10count <- count(day10_words)
day12count <- count(day12_words)
day15count <- count(day15_words)
day16count <- count(day16_words)
day17count <- count(day17_words)
day18count <- count(day18_words)
day19count <- count(day19_words)
day20count <- count(day20_words)
day21count <- count(day21_words)
day22count <- count(day22_words)
day23count <- count(day23_words)

Now I am going to mutate the days, specifically by the word count each day so I can merge them together and later graph over time the number of comments to see if the engagement constantly increased or not.

# Tag each one-row word count with the trial day it belongs to, so the rows can
# be stacked and plotted against `Day`.
# Day 19 is included here: day19count was computed but previously never
# tagged, which silently dropped that day.
day1countmerge <- day1count %>% mutate(Day = 1)
day2countmerge <- day2count %>% mutate(Day = 2)
day3countmerge <- day3count %>% mutate(Day = 3)
day4countmerge <- day4count %>% mutate(Day = 4)
day5countmerge <- day5count %>% mutate(Day = 5)
day6countmerge <- day6count %>% mutate(Day = 6)
day7countmerge <- day7count %>% mutate(Day = 7)
day8countmerge <- day8count %>% mutate(Day = 8)
day10countmerge <- day10count %>% mutate(Day = 10)
day12countmerge <- day12count %>% mutate(Day = 12)
day15countmerge <- day15count %>% mutate(Day = 15)
day16countmerge <- day16count %>% mutate(Day = 16)
day17countmerge <- day17count %>% mutate(Day = 17)
day18countmerge <- day18count %>% mutate(Day = 18)
day19countmerge <- day19count %>% mutate(Day = 19)
day20countmerge <- day20count %>% mutate(Day = 20)
day21countmerge <- day21count %>% mutate(Day = 21)
day22countmerge <- day22count %>% mutate(Day = 22)
day23countmerge <- day23count %>% mutate(Day = 23)
day24countmerge <- day24count %>% mutate(Day = 24)

Now I can actually merge together my variables to make a graph!

# Stack the per-day word counts into one table with columns `n` and `Day`.
# bind_rows() appends the one-row tables directly; the earlier chain of
# full_join() calls matched on every shared column, which amounts to the same
# stacking but printed a "Joining, by" message for each step.
# Day 19 was missing from the merge, so it is tagged and added inline from
# day19count.
mergedcount <- bind_rows(
  day1countmerge,
  day2countmerge,
  day3countmerge,
  day4countmerge,
  day5countmerge,
  day6countmerge,
  day7countmerge,
  day8countmerge,
  day10countmerge,
  day12countmerge,
  day15countmerge,
  day16countmerge,
  day17countmerge,
  day18countmerge,
  mutate(day19count, Day = 19),
  day20countmerge,
  day21countmerge,
  day22countmerge,
  day23countmerge,
  day24countmerge
)

Now I will create a line plot with the number of words in the comments on the y axis and the days on the x axis, so we can see how the volume of comments changed over time.

# Line plot of the total number of words per trial day.
# geom_line() connects points in order of the x variable, so the previous
# arrange(desc(n)) step had no effect on the plot and is dropped.
# `n` is a word count (see day1count), so the axes are labelled accordingly.
mergedcount %>%
  ggplot(aes(Day, n)) +
  geom_line() +
  labs(x = "Trial day", y = "Words in comments")

We see in the graph the comments never get lower than they were from the first four days, then it peaks dramatically. We see the most comments were on day 20, not 24.

Let’s see what are the most common words on day 1 compared to day 24? I will include stop_words to filter out words we don’t want (if, then, etc).

# Most frequent day 1 words after dropping stop words, most common first.
count(anti_join(day1_words, stop_words), word, sort = TRUE)

## Joining, by = "word"

## # A tibble: 19,205 × 2
##    word                              n
##    <chr>                         <int>
##  1 johnny                         8569
##  2 amber                          4910
##  3 justice                        3552
##  4 red_heart                      2948
##  5 clown_face                     2325
##  6 depp                           2325
##  7 face_with_tears_of_joy         1269
##  8 face_vomiting                  1090
##  9 rolling_on_the_floor_laughing  1040
## 10 justiceforjohnnydepp            987
## # … with 19,195 more rows

We see the top 10 most common words include johnny, amber, justice, red_heart, clown_face, depp, face_with_tears_of_joy, face_vomiting, rolling_on_the_floor_laughing, and justiceforjohnnydepp.

Now let’s see the most common words for day 24.

# Most frequent day 24 words after dropping stop words, most common first.
count(anti_join(day24_words, stop_words), word, sort = TRUE)

## Joining, by = "word"

## # A tibble: 57,943 × 2
##    word                              n
##    <chr>                         <int>
##  1 johnny                        27228
##  2 blue_heart                    25783
##  3 red_heart                     22531
##  4 amber                         15780
##  5 justice                        9648
##  6 face_with_tears_of_joy         9582
##  7 rolling_on_the_floor_laughing  8095
##  8 depp                           6491
##  9 camille                        6208
## 10 verdict                        6078
## # … with 57,933 more rows

The most common words included johnny, blue_heart, red_heart, amber, justice, face_with_tears_of_joy, rolling_on_the_floor_laughing, depp, camille, and verdict. This makes sense that verdict is now a popular word since this was toward the last day of the trial.

Now I am going to create variables by the words said on each day so I can merge to graph words over time.

# For every day: drop stop words, count how often each remaining word occurs
# (most frequent first) and tag the rows with the trial day.
day1_to_merge <- day1_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 1)

## Joining, by = "word"

day2_to_merge <- day2_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 2)

## Joining, by = "word"

day3_to_merge <- day3_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 3)

## Joining, by = "word"

day4_to_merge <- day4_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 4)

## Joining, by = "word"

day5_to_merge <- day5_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 5)

## Joining, by = "word"

day6_to_merge <- day6_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 6)

## Joining, by = "word"

day7_to_merge <- day7_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 7)

## Joining, by = "word"

day8_to_merge <- day8_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 8)

## Joining, by = "word"

day10_to_merge <- day10_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 10)

## Joining, by = "word"

day12_to_merge <- day12_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 12)

## Joining, by = "word"

day15_to_merge <- day15_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 15)

## Joining, by = "word"

day16_to_merge <- day16_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 16)

## Joining, by = "word"

day17_to_merge <- day17_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 17)

## Joining, by = "word"

day18_to_merge <- day18_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 18)

## Joining, by = "word"

day19_to_merge <- day19_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 19)

## Joining, by = "word"

day20_to_merge <- day20_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 20)

## Joining, by = "word"

day21_to_merge <- day21_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 21)

## Joining, by = "word"

day22_to_merge <- day22_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 22)

## Joining, by = "word"

day23_to_merge <- day23_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 23)

## Joining, by = "word"

day24_to_merge <- day24_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(Day = 24)

## Joining, by = "word"

Now I will actually merge these variables together.

# Stack the per-day word frequencies into one long table with columns `word`,
# `n` and `Day`.
# bind_rows() appends the tables directly; the earlier chain of full_join()
# calls matched on every shared column, which amounts to the same stacking but
# printed a "Joining, by" message for each step.
# day24_to_merge was built but never merged, so the final day was missing from
# everything derived from merged_days; it is included here.
merged_days <- bind_rows(
  day1_to_merge,
  day2_to_merge,
  day3_to_merge,
  day4_to_merge,
  day5_to_merge,
  day6_to_merge,
  day7_to_merge,
  day8_to_merge,
  day10_to_merge,
  day12_to_merge,
  day15_to_merge,
  day16_to_merge,
  day17_to_merge,
  day18_to_merge,
  day19_to_merge,
  day20_to_merge,
  day21_to_merge,
  day22_to_merge,
  day23_to_merge,
  day24_to_merge
)

With the line graph I am about to graph, we can see the most frequent words used during the trial in the key on the right, with the number of times they were said over time.

# Plot how often the most frequent words were used on each day.
# The words shown are those appearing in the 50 largest word/day counts (the
# same selection as before). Previously only those 50 rows were plotted, so a
# word's line was missing every day on which it fell outside the top 50.
# semi_join() keeps all days for the selected words so each line is complete.
merged_days %>%
  semi_join(
    merged_days %>%
      arrange(desc(n)) %>%
      head(50),
    by = "word"
  ) %>%
  ggplot(aes(Day, n, color = word)) +
  geom_line()

Now I will create a word cloud for day 1, limited to the 50 most frequent words. This visual makes it easier to see which words were frequently used, and how often they were used compared to one another.

library(wordcloud2)
# Word cloud of the 50 most frequent non-stop-words on day 1.
wordcloud2(
  head(
    count(anti_join(day1_words, stop_words), word, sort = TRUE),
    50
  )
)

## Joining, by = "word"

Through this I can see that ‘johnny’ and ‘amber’ are obviously noticeable. It is interesting to see ‘red_heart’ and ‘clown_face’, which represent emojis!

Now I will create a word cloud for day 24.

library(wordcloud2)
# Word cloud of the 50 most frequent non-stop-words on day 24.
wordcloud2(
  head(
    count(anti_join(day24_words, stop_words), word, sort = TRUE),
    50
  )
)

## Joining, by = "word"

It is interesting that ‘johnny’ and ‘red_heart’ are more noticeable compared to ‘amber’. Also, the ‘rolling_on_the_floor_laughing’ emoji is extremely noticeable and ‘justice’ is as well.

Now, I want to see the most positive words for day 1.

# Day 1 words that appear in the AFINN lexicon, ordered from the highest
# sentiment value down, with the more frequent word first within each value.
day1pos <- day1_words %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  count(word, value) %>%
  arrange(desc(value), desc(n))

## Joining, by = "word"
## Joining, by = "word"

day1pos

## # A tibble: 1,066 × 3
##    word        value     n
##    <chr>       <dbl> <int>
##  1 outstanding     5     3
##  2 win             4   397
##  3 lmao            4   274
##  4 wow             4   231
##  5 wins            4   141
##  6 funny           4    82
##  7 winning         4    78
##  8 lmfao           4    42
##  9 amazing         4    39
## 10 fun             4    34
## # … with 1,056 more rows

The most positive words include outstanding, win, lmao, wow, wins, funny, winning, lmfao, amazing, and fun. Let’s compare with the most positive words for day 24.

# Day 24 words that appear in the AFINN lexicon, ordered from the highest
# sentiment value down, with the more frequent word first within each value.
day24_words %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  count(word, value) %>%
  arrange(desc(value), desc(n))

## Joining, by = "word"
## Joining, by = "word"

## # A tibble: 1,450 × 3
##    word        value     n
##    <chr>       <dbl> <int>
##  1 outstanding     5    16
##  2 superb          5     6
##  3 hurrah          5     5
##  4 win             4  4436
##  5 wins            4  1923
##  6 lmao            4  1096
##  7 wow             4   836
##  8 amazing         4   380
##  9 winning         4   361
## 10 funny           4   350
## # … with 1,440 more rows

This includes outstanding, superb, hurrah, win, wins, lmao, wow, amazing, winning, and funny. Now let’s look at the most negative words for day 1!

# Day 1 words that appear in the AFINN lexicon, ordered from the lowest
# sentiment value up, with the more frequent word first within each value.
day_1_negative_words <- day1_words %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  count(word, value) %>%
  arrange(value, desc(n))

## Joining, by = "word"
## Joining, by = "word"

day_1_negative_words

## # A tibble: 1,066 × 3
##    word          value     n
##    <chr>         <dbl> <int>
##  1 motherfucking    -5     1
##  2 wtf              -4   362
##  3 torture          -4     5
##  4 fraud            -4     4
##  5 damned           -4     2
##  6 damn             -4     1
##  7 hell             -4     1
##  8 shrew            -4     1
##  9 tortured         -4     1
## 10 torturing        -4     1
## # … with 1,056 more rows

This includes motherfucking, wtf, torture, fraud, damned, damn, hell, shrew, tortured, and torturing. Let’s compare with the most negative words for day 24.

# Day 24 words that appear in the AFINN lexicon, ordered from the lowest
# sentiment value up, with the more frequent word first within each value.
day24_words %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  count(word, value) %>%
  arrange(value, desc(n))

## Joining, by = "word"
## Joining, by = "word"

## # A tibble: 1,450 × 3
##    word         value     n
##    <chr>        <dbl> <int>
##  1 wtf             -4   728
##  2 fraud           -4    25
##  3 torture         -4    14
##  4 damned          -4     6
##  5 catastrophic    -4     2
##  6 tortured        -4     2
##  7 damn            -4     1
##  8 frauds          -4     1
##  9 fraudsters      -4     1
## 10 fraudulent      -4     1
## # … with 1,440 more rows

Includes wtf, fraud, torture, damned, catastrophic, tortured, damn, frauds, fraudsters, fraudulent.

Now, I am going to see what the most positive and negative words are.

# Count every day 1 word (stop words are not removed here) and attach its
# positive/negative label from the bing lexicon. The result stays grouped by
# `word`, as shown in the printed header.
day1bing <- inner_join(
  count(group_by(day1_words, word), word, sort = TRUE),
  get_sentiments("bing")
)

## Joining, by = "word"

day1bing

## # A tibble: 1,668 × 3
## # Groups:   word [1,667]
##    word          n sentiment
##    <chr>     <int> <chr>    
##  1 like       1528 positive 
##  2 love        943 positive 
##  3 lies        587 negative 
##  4 objection   579 negative 
##  5 good        559 positive 
##  6 bad         402 negative 
##  7 win         397 positive 
##  8 right       374 positive 
##  9 support     309 positive 
## 10 guilty      299 negative 
## # … with 1,658 more rows

Now I am going to plot these results so we can visualize them easier.

# Horizontal bar chart of the first 10 rows of day1bing after sorting by
# descending count, with bars coloured by bing sentiment.
ggplot(
  head(arrange(group_by(day1bing, word), desc(n)), 10),
  aes(reorder(word, n), n, fill = sentiment)
) +
  geom_col() +
  coord_flip()

Now I want to group sentiments by day, then calculate the mean for each day. This will allow us to see how positive or negative the comments are.

# For each day, keep only the words found in the AFINN lexicon and take the
# mean of their sentiment values (one value per word occurrence).
day1_words_afinn <- inner_join(day1_words, get_sentiments("afinn"))

## Joining, by = "word"

day1mean <- mean(day1_words_afinn[["value"]])

day2_words_afinn <- inner_join(day2_words, get_sentiments("afinn"))

## Joining, by = "word"

day2mean <- mean(day2_words_afinn[["value"]])

day3_words_afinn <- inner_join(day3_words, get_sentiments("afinn"))

## Joining, by = "word"

day3mean <- mean(day3_words_afinn[["value"]])

day4_words_afinn <- inner_join(day4_words, get_sentiments("afinn"))

## Joining, by = "word"

day4mean <- mean(day4_words_afinn[["value"]])

day5_words_afinn <- inner_join(day5_words, get_sentiments("afinn"))

## Joining, by = "word"

day5mean <- mean(day5_words_afinn[["value"]])

day6_words_afinn <- inner_join(day6_words, get_sentiments("afinn"))

## Joining, by = "word"

day6mean <- mean(day6_words_afinn[["value"]])

day7_words_afinn <- inner_join(day7_words, get_sentiments("afinn"))

## Joining, by = "word"

day7mean <- mean(day7_words_afinn[["value"]])

day8_words_afinn <- inner_join(day8_words, get_sentiments("afinn"))

## Joining, by = "word"

day8mean <- mean(day8_words_afinn[["value"]])

day10_words_afinn <- inner_join(day10_words, get_sentiments("afinn"))

## Joining, by = "word"

day10mean <- mean(day10_words_afinn[["value"]])

day12_words_afinn <- inner_join(day12_words, get_sentiments("afinn"))

## Joining, by = "word"

day12mean <- mean(day12_words_afinn[["value"]])

day15_words_afinn <- inner_join(day15_words, get_sentiments("afinn"))

## Joining, by = "word"

day15mean <- mean(day15_words_afinn[["value"]])

day16_words_afinn <- inner_join(day16_words, get_sentiments("afinn"))

## Joining, by = "word"

day16mean <- mean(day16_words_afinn[["value"]])

day17_words_afinn <- inner_join(day17_words, get_sentiments("afinn"))

## Joining, by = "word"

day17mean <- mean(day17_words_afinn[["value"]])

day18_words_afinn <- inner_join(day18_words, get_sentiments("afinn"))

## Joining, by = "word"

day18mean <- mean(day18_words_afinn[["value"]])

day19_words_afinn <- inner_join(day19_words, get_sentiments("afinn"))

## Joining, by = "word"

day19mean <- mean(day19_words_afinn[["value"]])

day20_words_afinn <- inner_join(day20_words, get_sentiments("afinn"))

## Joining, by = "word"

day20mean <- mean(day20_words_afinn[["value"]])

day21_words_afinn <- inner_join(day21_words, get_sentiments("afinn"))

## Joining, by = "word"

day21mean <- mean(day21_words_afinn[["value"]])

day22_words_afinn <- inner_join(day22_words, get_sentiments("afinn"))

## Joining, by = "word"

day22mean <- mean(day22_words_afinn[["value"]])

# Mean AFINN sentiment value of the day 23 words found in the lexicon.
day23_words_afinn <- day23_words %>%
  inner_join(get_sentiments("afinn"))

## Joining, by = "word"

# Fixed copy-paste error: this previously averaged day1_words_afinn, so
# day23mean was just a copy of the day 1 mean.
day23mean <- mean(day23_words_afinn$value)

# Mean AFINN sentiment value of the day 24 words found in the lexicon.
day24_words_afinn <- inner_join(day24_words, get_sentiments("afinn"))

## Joining, by = "word"

day24mean <- mean(day24_words_afinn[["value"]])

Now I am going to graph the mean values.

# Plot the mean AFINN sentiment of each day's comments.
# merged_days holds one row per distinct word per day with its frequency `n`.
# The previous sum(value) counted every distinct word once regardless of how
# often it was used, and produced a total rather than the mean described in
# the text. weighted.mean() weights each word's value by its frequency.
# NOTE(review): stop words were removed when merged_days was built, so these
# means can differ slightly from the day*mean variables computed above.
# NOTE(review): the written interpretation of this graph was based on the
# unweighted totals and should be re-checked against the new plot.
merged_days %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(Day) %>%
  summarize(mean_value = weighted.mean(value, w = n)) %>%
  ggplot(aes(Day, mean_value, fill = mean_value)) +
  geom_col() +
  geom_smooth()

## Joining, by = "word"
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

This graph starts at 0 at the top, and goes down to negative numbers. Here, we can see that the trial always had a negative sentiment. There are missing days in the data which explains the gap in the graph. It is interesting to see how negative the values are from day 15 on.

TextAnalysis

Christina Alescio

2022-10-19

A Text Analysis of The Johnny Depp and Amber Heard Trial, Through Youtube Comments