Week 10 Assignment
Attached Files:
Week 10 Assignment Rubric.pdf (45.194 KB)
(As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com.) (You may work in a small team on this assignment.)
Examples 2.1–2.6 below are used verbatim from Text Mining with R¹.
The sentiments datasets
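The three outputs below come from tidytext's get_sentiments(); a minimal sketch of the calls, as in the book (AFINN and NRC are hosted by the textdata package and may prompt a one-time download):
library(tidytext)
library(textdata)  # hosts the AFINN and NRC lexicons

get_sentiments("afinn")  # word + integer value from -5 to 5
get_sentiments("bing")   # word + binary positive/negative label
get_sentiments("nrc")    # word + one of ten emotion/sentiment categories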
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
Sentiment analysis with inner join
tidy_books <-
austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter =
cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)

nrc_joy <-
get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)

## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
jane_austen_sentiment <-
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)

## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries
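The chunks below assume a pride_prejudice object; in the book it is simply the Pride & Prejudice rows of tidy_books:
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")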
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")## Joining, by = "word"
bing_and_nrc <-
bind_rows(pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)

## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")

## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
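These two counts tally the positive and negative entries in the NRC and Bing lexicons; the book generates them with:
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)

get_sentiments("bing") %>%
  count(sentiment)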
Most common positive and negative words
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()

## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)

Wordclouds
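The lone join message below belongs to the book's first wordcloud example, a sketch of which (assuming the wordcloud package) is:
library(wordcloud)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))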
## Joining, by = "word"
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20"
, "gray80"),
max.words = 100)## Joining, by = "word"
Looking at units beyond just words
p_and_p_sentences <-
tibble(text = prideprejudice) %>%
unnest_tokens(sentence,
text,
token = "sentences")## [1] "by jane austen"
austen_chapters <-
austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter,
text,
token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())

## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <-
get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())

## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()

## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
In order to conduct a sentiment analysis, I have to select a novel from the gutenbergr² package. Its fields are shown below. Upon selecting a corpus, I will import it according to its id #.
| gutenberg_metadata fields |
|---|
| gutenberg_id |
| title |
| author |
| gutenberg_author_id |
| language |
| gutenberg_bookshelf |
| rights |
| has_text |
gutenberg_metadata %>%
select(gutenberg_id, title, author, gutenberg_bookshelf) %>%
filter(gutenberg_id > 0) %>%
rename(id = gutenberg_id, subject = gutenberg_bookshelf) %>%
reactable(
bordered = TRUE,
striped = TRUE,
highlight = TRUE,
filterable = TRUE,
showPageSizeOptions = TRUE,
showPagination = TRUE,
columns = list(
id = colDef(minWidth = 70),
title = colDef(minWidth = 400),
author = colDef(minWidth = 100),
subject = colDef(minWidth = 200)
),
fullWidth = TRUE,
defaultPageSize = 5)

Ultimately I chose the author of The Legend of Sleepy Hollow (id # 41): Washington Irving³.
(irv_txt <- gutenberg_download(c(41, 877, 1371, 1850, 2048, 3293, 7002, 7948, 7993, 7994, 8519, 8571, 13042, 13514, 13515, 14228, 19293, 20656, 21195, 32987, 36652, 38192, 49258, 49259, 49872, 49947, 50352)))

## # A tibble: 245,690 x 2
## gutenberg_id text
## <int> <chr>
## 1 41 "THE LEGEND OF SLEEPY HOLLOW"
## 2 41 ""
## 3 41 ""
## 4 41 "by Washington Irving"
## 5 41 ""
## 6 41 ""
## 7 41 ""
## 8 41 ""
## 9 41 ""
## 10 41 "FOUND AMONG THE PAPERS OF THE LATE DIEDRICH KNICKERBOCKER."
## # ... with 245,680 more rows
Tidy
I use stop_words with the anti_join() function from the dplyr package to remove stop words from my download, and tidy my data so that its gutenberg_id, linenumber, and word are stored. I then get a count of each word, sorted accordingly. This and all subsequent code are heavily referenced from the examples in Text Mining with R.
tdy_irv_txt <-
irv_txt %>%
mutate(linenumber = row_number()) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
irv_cnt<-
tdy_irv_txt %>%
count(word, sort = TRUE)

Bing
Using Bing, I'm concerned with sentiment at the word level. Following the example from section 2.2 closely provides the results below.
(tdy_irv_bng <-
tdy_irv_txt %>%
inner_join(get_sentiments("bing")) %>%
count(word,
index = linenumber %/% 80,
sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative))

## # A tibble: 121,200 x 5
## word index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 abolish 1169 1 0 -1
## 2 abolish 2915 1 0 -1
## 3 abominable 149 1 0 -1
## 4 abominable 1338 1 0 -1
## 5 abominable 1396 1 0 -1
## 6 abominable 1446 1 0 -1
## 7 abominable 2896 1 0 -1
## 8 abominably 918 1 0 -1
## 9 abominably 1770 1 0 -1
## 10 abominably 1829 1 0 -1
## # ... with 121,190 more rows
However, I found a full word count more useful:
(tdy_irv_bng_cnts <-
tdy_irv_txt %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup())

## # A tibble: 4,086 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 enemy negative 1122
## 2 poor negative 834
## 3 love positive 762
## 4 death negative 717
## 5 master positive 705
## 6 worthy positive 591
## 7 strong positive 566
## 8 beautiful positive 552
## 9 attack negative 539
## 10 ready positive 523
## # ... with 4,076 more rows
NRC
The same pipeline with inner_join() also provides a very functional way to utilize the NRC sentiment lexicon.
(nrc_cnts <-
tdy_irv_txt %>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup())

## # A tibble: 10,958 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 time anticipation 3989
## 2 king positive 1922
## 3 found joy 1620
## 4 found positive 1620
## 5 found trust 1620
## 6 don positive 1206
## 7 don trust 1206
## 8 enemy anger 1122
## 9 enemy disgust 1122
## 10 enemy fear 1122
## # ... with 10,948 more rows
If we use the nrc_joy filtered sentiment from 2.2 on gutenberg_id 41, The Legend of Sleepy Hollow, the results are as follows.
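A sketch of the pipeline behind these results, assuming the book-41 rows are filtered out of tdy_irv_txt:
tdy_irv_txt %>%
  filter(gutenberg_id == 41) %>%  # keep only The Legend of Sleepy Hollow
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)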
## Joining, by = "word"
## # A tibble: 133 x 2
## word n
## <chr> <int>
## 1 church 14
## 2 tree 14
## 3 found 10
## 4 favorite 9
## 5 white 8
## 6 hero 7
## 7 green 6
## 8 companion 5
## 9 delight 5
## 10 true 5
## # ... with 123 more rows
Here we test the presence of AFINN, Bing, and NRC sentiment in Washington Irving's work. The afinn and bing_and_nrc variables are overwritten with our new results.
afinn <-
tdy_irv_txt %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 5) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")bing_and_nrc <-
bind_rows(
tdy_irv_txt %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
tdy_irv_txt %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method,
index = linenumber %/% 5,
sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)

bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")From here gutenberg_id number 41 for \(The\ Legend\ of\ Sleepy\ Hollow\) is again used.
## # A tibble: 5,051 x 3
## gutenberg_id linenumber word
## <int> <int> <chr>
## 1 41 1 legend
## 2 41 1 sleepy
## 3 41 1 hollow
## 4 41 4 washington
## 5 41 4 irving
## 6 41 10 found
## 7 41 10 papers
## 8 41 10 late
## 9 41 10 diedrich
## 10 41 10 knickerbocker
## # ... with 5,041 more rows
afinn_sh <- Sleepy_Hollow %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 5) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")bing_and_nrc_sh <- bind_rows(
Sleepy_Hollow %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
Sleepy_Hollow %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 5, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)

bind_rows(afinn_sh,
bing_and_nrc_sh) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")Again over writing variable bing_word_counts
(bing_word_counts <-
tdy_irv_txt %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup())

## # A tibble: 4,086 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 enemy negative 1122
## 2 poor negative 834
## 3 love positive 762
## 4 death negative 717
## 5 master positive 705
## 6 worthy positive 591
## 7 strong positive 566
## 8 beautiful positive 552
## 9 attack negative 539
## 10 ready positive 523
## # ... with 4,076 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)

set.seed(528)
tdy_irv_txt %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100,
               rot.per = 0.35,
               colors = brewer.pal(7, "Accent")))

tdy_irv_txt %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("red", "blue"),
max.words = 100)

For this portion I decided to work with the syuzhet package and follow the material provided in the syuzhet vignette⁴, which involves:
- get_text_as_string()
- get_tokens()
- get_sentences()
- get_sentiment() on the collected sentences or tokens, e.g.:
  - get_sentiment(token, method = "syuzhet")
  - get_sentiment(token, method = "bing")
  - get_sentiment(token, method = "afinn")
  - get_sentiment(token, method = "nrc", lang = "english")
  - get_sentiment(sentence_vector)

For its usage I will again use The Legend of Sleepy Hollow, located here⁵.
# import Library
library(syuzhet)
# Retrieve text
sh_book <- get_text_as_string("https://www.gutenberg.org/files/41/41.txt")
# create sentence vector
str(sh_s_v <- get_sentences(sh_book))
# Create Tokens
book_v <- get_tokens(sh_book, pattern = "\\W")
# create vectors
syuzhet_vector <- get_sentiment(book_v, method="syuzhet")
bing_vector <- get_sentiment(book_v, method = "bing")
afinn_vector <- get_sentiment(book_v, method = "afinn")
nrc_vector <- get_sentiment(book_v, method = "nrc", lang = "english")
s_v_sentiment <- get_sentiment(sh_s_v)

## chr [1:467] "Project Gutenberg's The Legend of Sleepy Hollow, by Washington Irving This eBook is for the use of anyone anyw"| __truncated__ ...
The heads of these four vectors (signs only) are shown in the 4x6 matrix below.
rbind(
sign(head(syuzhet_vector)),
sign(head(bing_vector)),
sign(head(afinn_vector)),
sign(head(nrc_vector))
)

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0
Specifically, the sum, mean, and summary of syuzhet_vector are as follows:
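A sketch of the calls behind the three outputs, as in the vignette:
sum(syuzhet_vector)
mean(syuzhet_vector)
summary(syuzhet_vector)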
## [1] 180.85
## [1] 0.01160858
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00000 0.00000 0.00000 0.01161 0.00000 1.00000
My goal was to use the methods noted in the vignette and see whether they would change my understanding of Sleepy Hollow, if at all. The plot() function was used with both syuzhet_vector and s_v_sentiment (my sentence vector) as a measure of the negative and positive sentiment throughout the story, with X measuring narrative time and Y the emotional valence at that point within the story.
I unfortunately found this plot the least useful since, as the vignette notes, "for visualization it is generally preferable to remove the noise and reveal the simple shape of the trajectory".
plot(
syuzhet_vector,
type="h",
main="syuzhet vector Plot Trajectory",
xlab = "Narrative Time",
ylab= "Emotional Valence"
)

plot(
s_v_sentiment,
type="l",
main="Sentence Vector Plot Trajectory",
xlab = "Narrative Time",
ylab= "Emotional Valence"
)

I feel the percentage-based plot was most useful, indicating a steady rise in negativity in what I believe to be the climax of the book, and a large shift to positive valence in the conclusion.
percent_vals <- get_percentage_values(syuzhet_vector, bins = 10)
plot(
percent_vals,
type="l",
main="Sleepy Hollow Using Percentage-Based Means",
xlab = "Narrative Time",
ylab= "Emotional Valence",
col="red"
)

Using percentage-based means with more bins echoes the plot above, with more fluctuation, which is more in line with how this particular story is told, in my view.
percent_vals <- get_percentage_values(syuzhet_vector, bins = 20)
plot(
percent_vals,
type="l",
main="Sleepy Hollow Using Percentage-Based Means",
xlab = "Narrative Time",
ylab= "Emotional Valence",
col="red"
)

Visually, using Fourier-transformed values (ft_values) to smooth these representations limits the chaotic impression given by percent_vals.
ft_values <- get_transformed_values(
syuzhet_vector,
low_pass_size = 3,
x_reverse_len = 100,
padding_factor = 2,
scale_vals = TRUE,
scale_range = FALSE
)

plot(
ft_values,
type ="l",
main ="Joyce's Portrait using Transformed Values",
xlab = "Narrative Time",
ylab = "Emotional Valence",
col = "red"
However, the practicality of simple_plot() would make it my preference in most scenarios.
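A minimal call, per the vignette:
simple_plot(syuzhet_vector)  # plots the raw trajectory alongside smoothed versions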
1. Silge, J., & Robinson, D. (n.d.). Text Mining with R: A Tidy Approach. https://www.tidytextmining.com/sentiment.html ↩︎
2. Project Gutenberg. (n.d.). Retrieved April 19, 2021, from https://www.gutenberg.org/ebooks/ ↩︎
3. Irving, Washington, 1783-1859. Rip Van Winkle, and The Legend of Sleepy Hollow. New York: Macmillan, 1963. ↩︎
4. Jockers, M. (n.d.). Introduction to the Syuzhet Package. Retrieved April 19, 2021, from https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html ↩︎
5. The Legend of Sleepy Hollow. Project Gutenberg. Retrieved April 19, 2021, from https://www.gutenberg.org/files/41/41-0.txt ↩︎