DATA 607 - Week 10 Assignment

Assignment 10 - Description

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

  • Work with a different corpus of your choosing, and
  • Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

As usual, please submit links both to an .Rmd file posted in your GitHub repository and to your code on rpubs.com. You may work in a small team on this assignment.

Primary example code from Chapter 2 of “Text Mining with R” by Julia Silge and David Robinson

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(textdata)
## Warning: package 'textdata' was built under R version 4.2.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.2.3
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.3
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(lexicon)
## Warning: package 'lexicon' was built under R version 4.2.3
library(readr)

The function get_sentiments() allows us to get specific sentiment lexicons with the appropriate measures for each one.

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,862 more rows
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")


tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

My Corpus

I will be using the Loughran, SentiWordNet, and SlangSD lexicons for my sentiment analysis of the TV show “The Office”.
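
Before building the corpus, it helps to peek at how each lexicon is structured. Loughran-McDonald is a categorical lexicon retrieved through get_sentiments(), while SentiWordNet and SlangSD ship with the lexicon package as key tables of words and numeric polarity scores. The quick check below is only a sketch to confirm those formats before the columns are renamed in the next chunk.

# Loughran-McDonald: words tagged with categorical sentiments
get_sentiments("loughran") %>% 
  count(sentiment)

# SentiWordNet and SlangSD (lexicon package): word (x) and numeric polarity score (y)
head(lexicon::hash_sentiment_sentiword)
head(lexicon::hash_sentiment_slangsd)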

library(schrute)
## Warning: package 'schrute' was built under R version 4.2.3
library(tibble)
library(childesr)
## Warning: package 'childesr' was built under R version 4.2.3
library(readr)
# SentiWordNet lexicon (from the lexicon package)
office_sentiword <- hash_sentiment_sentiword
names(office_sentiword) <- c("word", "score")

# SlangSD lexicon (from the lexicon package)
office_slangsd <- hash_sentiment_slangsd
names(office_slangsd) <- c("word", "score")
# Load dataset
theOfficeData <- schrute::theoffice
glimpse(theOfficeData)
## Rows: 55,130
## Columns: 12
## $ index            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ season           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode_name     <chr> "Pilot", "Pilot", "Pilot", "Pilot", "Pilot", "Pilot",…
## $ director         <chr> "Ken Kwapis", "Ken Kwapis", "Ken Kwapis", "Ken Kwapis…
## $ writer           <chr> "Ricky Gervais;Stephen Merchant;Greg Daniels", "Ricky…
## $ character        <chr> "Michael", "Jim", "Michael", "Jim", "Michael", "Micha…
## $ text             <chr> "All right Jim. Your quarterlies look very good. How …
## $ text_w_direction <chr> "All right Jim. Your quarterlies look very good. How …
## $ imdb_rating      <dbl> 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6…
## $ total_votes      <int> 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706,…
## $ air_date         <chr> "2005-03-24", "2005-03-24", "2005-03-24", "2005-03-24…
office_season <- theOfficeData %>%
  filter(season %in% c(5, 6))
# filter(season %in% c(5, 6) & episode == 1)

# Examine the dataset
glimpse(office_season)
## Rows: 15,365
## Columns: 12
## $ index            <int> 18560, 18561, 18562, 18563, 18564, 18565, 18566, 1856…
## $ season           <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ episode          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode_name     <chr> "Weight Loss (Parts 1&2)", "Weight Loss (Parts 1&2)",…
## $ director         <chr> "Paul Feig", "Paul Feig", "Paul Feig", "Paul Feig", "…
## $ writer           <chr> "Lee Eisenberg;Gene Stupnitsky", "Lee Eisenberg;Gene …
## $ character        <chr> "Michael", "Dwight", "Michael", "Meredith", "Stanley"…
## $ text             <chr> "All right, everybody. This is your last meal, so eat…
## $ text_w_direction <chr> "All right, everybody. This is your last meal, so eat…
## $ imdb_rating      <dbl> 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8…
## $ total_votes      <int> 2501, 2501, 2501, 2501, 2501, 2501, 2501, 2501, 2501,…
## $ air_date         <chr> "2008-09-25", "2008-09-25", "2008-09-25", "2008-09-25…
# Creating a subset of the data that will later be used to perform tidying
office_season <- office_season[c("text", "character", "index", "season", "episode", "episode_name", "imdb_rating", "total_votes", "air_date" )]

# Remove rows with missing values if there are any
office_season <- office_season %>%
  filter(!is.na(character), !is.na(text))

# Remove punctuation (this also strips apostrophes, so contractions like "I'm" become "im") and convert to lowercase
office_season$text <- tolower(gsub("[^[:alnum:] ]", "", office_season$text))

office_season
## # A tibble: 15,365 × 9
##    text             chara…¹ index season episode episo…² imdb_…³ total…⁴ air_d…⁵
##    <chr>            <chr>   <int>  <int>   <int> <chr>     <dbl>   <int> <chr>  
##  1 all right every… Michael 18560      5       1 Weight…     8.8    2501 2008-0…
##  2 from this point… Dwight  18561      5       1 Weight…     8.8    2501 2008-0…
##  3 this summer cor… Michael 18562      5       1 Weight…     8.8    2501 2008-0…
##  4 what should we … Meredi… 18563      5       1 Weight…     8.8    2501 2008-0…
##  5 im taking the d… Stanley 18564      5       1 Weight…     8.8    2501 2008-0…
##  6 no no no this i… Dwight  18565      5       1 Weight…     8.8    2501 2008-0…
##  7 i can bring the… Creed   18566      5       1 Weight…     8.8    2501 2008-0…
##  8 im taking my du… Stanley 18567      5       1 Weight…     8.8    2501 2008-0…
##  9 there take thos… Dwight  18568      5       1 Weight…     8.8    2501 2008-0…
## 10 dwight dwight    Michael 18569      5       1 Weight…     8.8    2501 2008-0…
## # … with 15,355 more rows, and abbreviated variable names ¹​character,
## #   ²​episode_name, ³​imdb_rating, ⁴​total_votes, ⁵​air_date

Tidying the data

tidy_office <- office_season %>%
  group_by(character) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# Focus on Michael's lines
char_sentimentM <- tidy_office %>% 
  filter(character == "Michael")

char_sentimentM
## # A tibble: 50,684 × 11
##    charac…¹ index season episode episo…² imdb_…³ total…⁴ air_d…⁵ linen…⁶ chapter
##    <chr>    <int>  <int>   <int> <chr>     <dbl>   <int> <chr>     <int>   <int>
##  1 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  2 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  3 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  4 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  5 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  6 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  7 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  8 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
##  9 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
## 10 Michael  18560      5       1 Weight…     8.8    2501 2008-0…       1       0
## # … with 50,674 more rows, 1 more variable: word <chr>, and abbreviated
## #   variable names ¹​character, ²​episode_name, ³​imdb_rating, ⁴​total_votes,
## #   ⁵​air_date, ⁶​linenumber
#sentiment <- get_sentiments("loughran")
#sentiment
loughran_word_cnt <- char_sentimentM %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

loughran_word_cnt
## # A tibble: 303 × 3
##    word      sentiment       n
##    <chr>     <chr>       <int>
##  1 good      positive      192
##  2 could     uncertainty    89
##  3 great     positive       71
##  4 ill       negative       57
##  5 best      positive       47
##  6 believe   uncertainty    36
##  7 better    positive       36
##  8 maybe     uncertainty    34
##  9 wrong     negative       26
## 10 beautiful positive       25
## # … with 293 more rows
loughran_word_cnt %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size=8))  +
  labs(x = "Contribution to sentiment",
       y = NULL)

Create sentiment scores based on each of the lexicons

sentiment_sentiword <- char_sentimentM %>% 
  inner_join(office_sentiword, by = "word") %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>%
  mutate(method = "SENTIWORD") %>% 
  mutate(PosNeg = ifelse(sentiment > 0, "Positive", "Negative"))

sentiment_slangsd <- char_sentimentM %>% 
  inner_join(office_slangsd, by = "word") %>%
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>%
  mutate(method = "SLANGSD") %>% 
  mutate(PosNeg = ifelse(sentiment > 0, "Positive", "Negative"))

sentiment_afinn <- char_sentimentM %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN") %>% 
  mutate(PosNeg = ifelse(sentiment > 0, "Positive", "Negative"))

sentiment_loughran <- char_sentimentM %>% 
  inner_join(get_sentiments("loughran")) %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  group_by(index = linenumber %/% 80) %>%
  mutate(method = "LOUGHRAN") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative) %>%
  mutate(PosNeg = ifelse(sentiment > 0, "Positive", "Negative"))

Plot and compare each lexicon

bind_rows(sentiment_sentiword, sentiment_slangsd, sentiment_afinn, sentiment_loughran) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Plot sentiment for each lexicon (positive vs. negative)

ggplot(sentiment_slangsd, aes(index, sentiment, fill = PosNeg)) +
  geom_col(show.legend = FALSE) + labs(title = "SlangSD")

ggplot(sentiment_sentiword, aes(index, sentiment, fill = PosNeg)) +
  geom_col(show.legend = FALSE) + labs(title = "SentiWordNet")

ggplot(sentiment_loughran, aes(index, sentiment, fill = PosNeg)) +
  geom_col(show.legend = FALSE) + labs(title = "Loughran")

ggplot(sentiment_afinn, aes(index, sentiment, fill = PosNeg)) +
  geom_col(show.legend = FALSE) + labs(title = "AFINN")

Display the most frequently used words by joining on the Loughran lexicon

loughran_lex_count <- tidy_office %>%
  inner_join(get_sentiments("loughran")) %>%
  filter(!is.na(sentiment)) %>%
  count(word, sentiment, sort = TRUE)
    
loughran_lex_count
## # A tibble: 601 × 3
##    word    sentiment       n
##    <chr>   <chr>       <int>
##  1 good    positive      507
##  2 could   uncertainty   300
##  3 great   positive      228
##  4 ill     negative      228
##  5 maybe   uncertainty   154
##  6 better  positive      114
##  7 bad     negative      105
##  8 best    positive       98
##  9 might   uncertainty    92
## 10 believe uncertainty    86
## # … with 591 more rows
loughran_lex_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
  labs(y = "Word count",
       x = "Words from The Office script") +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 8)) +
  coord_flip()

office_word_freq <- tidy_office %>%
  inner_join(get_sentiments("loughran")) %>%
  filter(!is.na(sentiment)) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE)
    
office_word_freq
## # A tibble: 474 × 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 good      positive    507
##  2 great     positive    228
##  3 ill       negative    228
##  4 better    positive    114
##  5 bad       negative    105
##  6 best      positive     98
##  7 happy     positive     81
##  8 wrong     negative     80
##  9 problem   negative     45
## 10 beautiful positive     40
## # … with 464 more rows

Comparison word cloud of Loughran sentiment categories: positive, negative, superfluous, and litigious

tidy_office %>%
  inner_join(get_sentiments("loughran")) %>%
  filter(sentiment %in% c("positive", "negative", "superfluous", "litigious")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("purple", "green"),
                   max.words = 100)
## Warning in comparison.cloud(., colors = c("purple", "green"), max.words = 100):
## amorphous could not be fit on page. It will not be plotted.

Word cloud of Loughran sentiment words (positive, negative, superfluous, and litigious) after removing stop words

tidy_office %>%
  inner_join(get_sentiments("loughran")) %>%
  filter(sentiment %in% c("positive", "negative", "superfluous", "litigious")) %>%
  anti_join(stop_words) %>% 
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, colors = c("purple", "green")))

Plot the most frequently used words in the seasons that I am analyzing

# Keep words that appear at least 20 times
top_10 <- office_word_freq %>%
  filter(n >= 20)

ggplot(top_10, aes(x = word, y = n, fill = sentiment)) + 
  geom_col() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 8)) +
  labs(title = "Most Frequently Used Words",
       x = "Frequently used words",
       y = "Word count")

Conclusion

After reviewing the dialogue from the TV show “The Office” (seasons 5 and 6) and performing sentiment analysis, I can conclude that positive and uncertainty words were the most common. Overall, the SlangSD lexicon flagged more negative word usage than the other lexicons.
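
As a rough check on this conclusion, the per-lexicon scores built earlier can be tallied. The sketch below (assuming the sentiment_sentiword, sentiment_slangsd, sentiment_afinn, and sentiment_loughran data frames from above) counts how many 80-line chunks each lexicon scored as net positive versus net negative.

# Tally net-positive vs. net-negative chunks for each lexicon
bind_rows(sentiment_sentiword, sentiment_slangsd, sentiment_afinn, sentiment_loughran) %>%
  ungroup() %>%
  count(method, PosNeg) %>%
  pivot_wider(names_from = PosNeg, values_from = n, values_fill = 0)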

References

Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. Chapter 2, “Sentiment Analysis with Tidy Data.” O’Reilly Media. https://www.tidytextmining.com/sentiment.html

Sentiment Analysis: Lexicon Models vs Machine Learning. Medium (Nerd For Tech). https://medium.com/nerd-for-tech/sentiment-analysis-lexicon-models-vs-machine-learning-b6e3af8fe746#:~:text=AFINN%20Lexicon%20is%20the%20most,along%20with%20it’s%20polarity%20score.