Introduction

This code is taken from chapter 2 of Text Mining With R by Julia Silge and David Robinson. I will apply the code to a different corpus of text and incorporate an additional lexicon.

library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(textdata)
## Warning: package 'textdata' was built under R version 3.6.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)

# Tokenise the Austen novels into one word per row. Before tokenising, each
# line of text is tagged with its line number within its book and with a
# running chapter count.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # Every line that matches the chapter-heading pattern bumps the count.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
  
# Keep only the NRC lexicon entries whose sentiment is "joy".
nrcjoy <- filter(get_sentiments("nrc"), sentiment == "joy")
  
# Net Bing sentiment (positive minus negative word counts) for each book,
# computed over consecutive 80-line sections identified by `index`.
janeaustensentiment <- inner_join(tidy_books, get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
# One panel per book, each with its own x-axis range: net sentiment by index.
ggplot(
  data = janeaustensentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

* Title: Text Mining with R, Chapter 2 code
* Author: Silge, J. and Robinson, D.
* Date: 2017
* Availability: http://dl.booktolearn.com/ebooks2/computer/programming/9781491981658_Text_Mining_with_R_f482.pdf

# Load the Jeopardy data from the working directory and preview three rows.
jeopardy <- read.csv(file = "JEOPARDY_CSV.csv")
head(jeopardy, n = 3)
##   Show.Number   Air.Date     Round                        Category Value
## 1        4680 2004-12-31 Jeopardy!                         HISTORY  $200
## 2        4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES  $200
## 3        4680 2004-12-31 Jeopardy!     EVERYBODY TALKS ABOUT IT...  $200
##                                                                                                      Question
## 1            For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory
## 2 No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves
## 3                    The city of Yuma in this state has a record average of 4,055 hours of sunshine each year
##       Answer
## 1 Copernicus
## 2 Jim Thorpe
## 3    Arizona

This is a dataset of Jeopardy questions and answers from various shows, and includes information such as the type of round, the value of the question, the category, and more.

# Split every question into individual words, one row per word. Question is
# coerced to character first so it can be tokenised even if it was read in as
# a factor.
question_tokens <- mutate(jeopardy, Question = as.character(Question)) %>%
  unnest_tokens(output = word, input = Question)
# Score each (Answer, Round) pair with the Loughran lexicon: count the matched
# words per sentiment category, spread the categories into columns, and take
# positive minus negative as the net sentiment.
question_sentiment <- question_tokens %>%
  inner_join(get_sentiments("loughran")) %>%
  count(Answer, Round, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(Round) %>%
  # Number the rows 1..n inside each round so every facet gets its own x-axis.
  # Grouping derives the rounds from the data, so this does not depend on the
  # set of round names, their sort order, or every round having at least one
  # row (a hand-built `1:sum(...)` yields c(1, 0) for an empty round and then
  # fails with a length mismatch).
  group_by(Round) %>%
  mutate(index = row_number()) %>%
  ungroup()
## Joining, by = "word"

This creates a data frame similar to the one built from the Jane Austen books. Rows are indexed within each round so that the data can be plotted the same way, with one subplot per round type. This time I used the Loughran lexicon instead of Bing.

# One panel per round, each with its own x-axis range: net sentiment by index.
ggplot(
  data = question_sentiment,
  mapping = aes(x = index, y = sentiment, fill = Round)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Round, scales = "free_x", ncol = 2)