In this assignment, you should start by getting the primary example code from Chapter 2 working in an R Markdown document, and you should provide a citation to that base code. You are then asked to extend the code in two ways: 1. work with a different corpus of your choosing, and 2. incorporate at least one additional sentiment lexicon. The base code below comes from Chapter 2, "Sentiment analysis with tidy data," of *Text Mining with R* by Julia Silge and David Robinson (https://www.tidytextmining.com/sentiment.html).
#### Loading Libraries and Sentiments
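The chunks that follow assume the packages below are installed and attached; this is a minimal setup sketch (the NRC, AFINN, and Loughran lexicons are fetched on first use through the textdata package):

# packages used throughout this document
library(tidyverse)    # dplyr, ggplot2, tidyr, stringr
library(tidytext)     # unnest_tokens(), get_sentiments(), stop_words
library(janeaustenr)  # austen_books(), prideprejudice
library(gutenbergr)   # gutenberg_works(), gutenberg_download()
library(syuzhet)      # get_sentiment()
library(wordcloud)    # wordcloud(), comparison.cloud()
library(reshape2)     # acast()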
# tidy the Austen novels: one word per row, keeping track of line and chapter numbers
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# joy words from the NRC lexicon, counted in "Emma"
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
# net Bing sentiment (positive - negative) per 80-line section of each novel
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# compare the three lexicons on "Pride & Prejudice"
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
      filter(sentiment %in% c(
        "positive",
        "negative"
      ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
)) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# words that contribute most to each Bing sentiment
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
## Selecting by n
# "miss" is coded negative by Bing but is mostly a title in Austen, so treat it as a stop word
custom_stop_words <- bind_rows(
  tibble(
    word = c("miss"),
    lexicon = c("custom")
  ),
  stop_words
)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
# word cloud of the most common non-stop words
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
# comparison cloud of the most common positive and negative Bing words
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
# tokenize "Pride & Prejudice" by sentence and all novels by chapter
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# find the chapter with the highest share of Bing-negative words in each novel
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## Selecting by ratio
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# loading "The Picture of Dorian Gray" from Project Gutenberg
dorian <- gutenberg_works(title == 'The Picture of Dorian Gray')
dorian_corpus <- gutenberg_download(dorian$gutenberg_id)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# creating a data frame by tokenization of chapters, paragraphs, and words
dc <- dorian_corpus %>%
  mutate(
    chapter_num = cumsum(str_detect(text, regex('^chapter [0-9]+', ignore_case = TRUE)))
  ) %>%
  filter(chapter_num != 0) %>%
  unnest_tokens(chapter, text, token = 'regex', pattern = 'CHAPTER [0-9]+') %>%
  unnest_tokens(paragraph, chapter, token = 'paragraphs') %>%
  group_by(chapter_num) %>%
  mutate(paragraph_num = 1:n()) %>%
  ungroup() %>%
  unnest_tokens(word, paragraph) %>%
  anti_join(stop_words)
## Joining, by = "word"
# word counts by chapter
dc %>%
  group_by(chapter_num) %>%
  count(word, sort = TRUE) %>%
  mutate(chapter_num = reorder(chapter_num, n)) %>%
  ggplot(aes(chapter_num, n)) +
  geom_col(fill = 'steelblue') +
  ylab('Word Count') +
  coord_flip()
# top 10 words in the book
dc %>%
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = 'steelblue') +
  ylab('Word Count') +
  coord_flip()
## Selecting by n
# top words by chapter
dc %>%
  group_by(chapter_num) %>%
  count(word) %>%
  top_n(1) %>%
  ungroup() %>%
  arrange(chapter_num)
## Selecting by n
## # A tibble: 23 x 3
## chapter_num word n
## <int> <chr> <int>
## 1 1 dorian 22
## 2 1 harry 22
## 3 1 lord 22
## 4 2 dorian 43
## 5 3 lord 34
## 6 4 harry 29
## 7 5 mother 26
## 8 5 sibyl 26
## 9 6 dorian 26
## 10 7 dorian 25
## # ... with 13 more rows
# score each word with the syuzhet lexicon and store the scores as a vector
sentiment_vector <- get_sentiment(dc$word, method = 'syuzhet')
dc <- cbind(dc, sentiment_vector)

# calculate sentiment by paragraph
dc_sent_para <- dc %>%
  group_by(chapter_num, paragraph_num) %>%
  summarize(sentiment = sum(sentiment_vector))
## `summarise()` regrouping output by 'chapter_num' (override with `.groups` argument)
# graph of paragraph-level sentiment by chapter, using the syuzhet scores
ggplot(data = dc_sent_para, aes(paragraph_num, sentiment, fill = chapter_num)) +
  geom_col() +
  facet_wrap(~chapter_num)
# graph of the proportion of positive and negative paragraphs
dc_sent_para %>%
  mutate(sentiment2 = ifelse(sentiment >= 0, 'positive', 'negative')) %>%
  group_by(sentiment2) %>%
  summarize(n = n()) %>%
  mutate(freq = round(n / sum(n), 2)) %>%
  ggplot(aes(x = sentiment2, y = freq)) +
  geom_col(fill = 'steelblue') +
  geom_text(aes(label = freq), vjust = 2, color = 'white', size = 5)
## `summarise()` ungrouping output (override with `.groups` argument)
# wordcloud using loughran lexicon
dc %>%
  inner_join(get_sentiments('loughran')) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = 'n', fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
*The Picture of Dorian Gray* contains nearly equal proportions of positive and negative sentiment paragraphs. What is surprising is that the final chapter is made up almost entirely of negative-sentiment paragraphs.
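As a quick check on that observation, the paragraph-level scores computed above can be tabulated by sign within the final chapter; a minimal sketch using the dc_sent_para data frame built earlier (ungrouping first, since summarize() left it grouped by chapter_num):

# share of positive vs. negative paragraphs in the last chapter
dc_sent_para %>%
  ungroup() %>%
  filter(chapter_num == max(chapter_num)) %>%
  mutate(direction = ifelse(sentiment >= 0, 'positive', 'negative')) %>%
  count(direction) %>%
  mutate(prop = round(n / sum(n), 2))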