Get the primary example code from Chapter 2 of Text Mining with R working and provide a citation for this base code.
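First, let’s take a look at the sentiment analysis conducted in Chapter 2 (“Sentiment analysis with tidy data”) of Text Mining with R. The base code in this section is taken from: Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. O’Reilly Media. https://www.tidytextmining.com/sentiment.html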
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.4
# Attach the packages this pipeline relies on. dplyr supplies %>%, group_by(),
# mutate(), row_number() and ungroup(); stringr supplies str_detect() and
# regex(); tidytext supplies unnest_tokens(). Attaching them here keeps the
# script runnable from a fresh session (library() is a no-op when a package
# is already attached).
library(dplyr)
library(stringr)
library(tidytext)

# Build a one-word-per-row table of the Austen novels. Before tokenizing,
# record each line's position within its book and a running chapter number:
# the counter increases at every line that starts with "chapter" followed by
# a digit or roman numeral, so lines before the first heading get chapter 0.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# I've thus assigned a word per row in this dataset, and now I can attempt some sentiment analysis.
# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Rank the joy words found in "Emma" by frequency. The join key is left
# implicit, so dplyr reports which shared column it matched on.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
library(tidyr)
# Net Bing sentiment for each 80-line section of each novel: tag words with
# the Bing lexicon, count positive/negative hits per section, spread the two
# counts into columns (0 where a section has none), and take the difference.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
# One bar per 80-line section, one panel per novel. The x axis is freed per
# panel because the books differ in length.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Restrict the tokenized corpus to a single novel for the lexicon comparison.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
# AFINN attaches a numeric `value` to each word, so the score of an 80-line
# section is the sum of the values of its matched words.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
# Bing and NRC label words as positive/negative rather than scoring them, so
# both are handled the same way: stack the two tagged copies of the novel,
# count labels per method and 80-line section, then take positive - negative.
# NRC is first reduced to its "positive"/"negative" entries.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Stack the three per-section score tables and draw one panel per lexicon.
# The y axis is freed per panel so each method uses its own scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
Next, we can compare how many positive and negative words each lexicon contains, and then distill the dataset to its most common positive and negative words.
# Count the NRC lexicon entries labelled positive and negative.
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
# Count the Bing lexicon entries per sentiment label.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# For every word matched by the Bing lexicon, count how often it occurs
# across all novels together with its label, most frequent first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
Next, let’s look at these word counts graphically.
# Plot the ten most frequent words per sentiment label. top_n() is called
# without a weighting column, so it picks one itself and reports the choice.
# Words are reordered by count so the bars are sorted after coord_flip().
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
## Selecting by n
# Let's filter out the stop words and produce a word cloud
# Prepend a custom entry for "miss" to the standard stop-word table, using
# the same word/lexicon columns so the rows stack cleanly.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)
#custom_stop_words
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.3
# Word cloud of the 100 most frequent words once the standard stop words
# are removed. with() exposes the count table's columns to wordcloud().
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining, by = "word"
Next, we can distinguish between positive and negative words in our word cloud.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# comparison.cloud() takes a matrix, so the word/sentiment counts are cast
# with acast() into one row per word and one column per sentiment label
# (0 where a word lacks that label) before plotting.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
Let’s look at units of text beyond single words, such as sentences and chapters.
# Tokenize "Pride and Prejudice" into sentences instead of words, then
# inspect the second sentence as a quick sanity check.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"
#> [1] "by jane austen"
# Split each novel into chunks with a regex tokenizer, using chapter
# headings as the delimiter; grouping by book keeps the chunks per novel.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter,
    text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()
# Number of chunks produced for each novel.
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
#> # A tibble: 6 x 2
#> book chapters
#> <fct> <int>
#> 1 Sense & Sensibility 51
#> 2 Pride & Prejudice 62
#> 3 Mansfield Park 49
#> 4 Emma 56
#> 5 Northanger Abbey 32
#> 6 Persuasion 25
This dataset comes from Kaggle and is a collection of customer sentiment pulled from Rotten Tomatoes. It includes language from the most popular review for each movie; the analysis below uses the text in the critics_consensus column.
# Load the Rotten Tomatoes movie table from GitHub and preview the first
# rows.
reviews <- read.csv(
  file = "https://raw.githubusercontent.com/evanmclaughlin/ECM607/master/RT_movie_reviews.csv"
)
head(reviews)
## rotten_tomatoes_link
## 1 m/0814255
## 2 m/0878835
## 3 m/10
## 4 m/1000013-12_angry_men
## 5 m/1000079-20000_leagues_under_the_sea
## 6 m/10000_bc
## movie_title
## 1 Percy Jackson & the Olympians: The Lightning Thief
## 2 Please Give
## 3 10
## 4 12 Angry Men (Twelve Angry Men)
## 5 20,000 Leagues Under The Sea
## 6 10,000 B.C.
## movie_info
## 1 Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
## 2 Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
## 3 A successful, middle-aged Hollywood songwriter falls hopelessly in love with the woman of his dreams, and even follows the girl and her new husband to their Mexican honeymoon resort. While his behavior seems sure to land him in trouble, out of the blue fate plays into his hands.
## 4 Following the closing arguments in a murder trial, the 12 members of the jury must deliberate, with a guilty verdict meaning death for the accused, an inner-city teen. As the dozen men try to reach a unanimous decision while sequestered in a room, one juror (Henry Fonda) casts considerable doubt on elements of the case. Personal issues soon rise to the surface, and conflict threatens to derail the delicate process that will decide one boy's fate.
## 5 In 1866, Professor Pierre M. Aronnax (Paul Lukas) and his assistant Conseil (Peter Lorre), stranded in San Francisco by reports of a giant sea monster attacking ships in the Pacific Ocean, are invited to join an expedition to search for the creature. During the search, they and harpooner Ned Land (Kirk Douglas) are thrown overboard during an attack, eventually discovering that the supposed monster is actually a submarine piloted by the brilliant but haunted Captain Nemo (James Mason).
## 6 Mammoth hunter D'Leh (Steven Strait) has long been in love with a beautiful, blue-eyed tribeswoman named Evolet (Camilla Belle). After horseback-riding raiders kidnap most of his D'Leh's fellow tribesmen as well as Evolet, he sets out on a dangerous trek to rescue her from her captors.
## critics_consensus
## 1 Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a speedy plot, and plenty of fun with Greek mythology.
## 2 Nicole Holofcener's newest might seem slight in places, but its rendering of complex characters in a conflicted economic landscape is varied, natural, and touching all the same.
## 3 Blake Edwards' bawdy comedy may not score a perfect 10, but Dudley Moore's self-deprecating performance makes this midlife crisis persistently funny.
## 4 Sidney Lumet's feature debut is a superbly written, dramatically effective courtroom thriller that rightfully stands as a modern classic.
## 5 One of Disney's finest live-action adventures, 20,000 Leagues Under the Sea brings Jules Verne's classic sci-fi tale to vivid life, and features an awesome giant squid.
## 6 With attention strictly paid to style instead of substance, or historical accuracy, 10,000 B.C. is a visually impressive but narratively flimsy epic.
## content_rating genres
## 1 PG Action & Adventure, Comedy, Drama, Science Fiction & Fantasy
## 2 R Comedy
## 3 R Comedy, Romance
## 4 NR Classics, Drama
## 5 G Action & Adventure, Drama, Kids & Family
## 6 PG-13 Action & Adventure, Classics, Drama
## directors authors
## 1 Chris Columbus Craig Titley, Chris Columbus, Rick Riordan
## 2 Nicole Holofcener Nicole Holofcener
## 3 Blake Edwards Blake Edwards
## 4 Sidney Lumet Reginald Rose
## 5 Richard Fleischer Earl Felton
## 6 Roland Emmerich Harald Kloser, Roland Emmerich
## actors
## 1 Logan Lerman, Brandon T. Jackson, Alexandra Daddario, Jake Abel, Sean Bean, Pierce Brosnan, Steve Coogan, Rosario Dawson, Melina Kanakaredes, Catherine Keener, Kevin Mckidd, Joe Pantoliano, Uma Thurman, Ray Winstone, Julian Richings, Bonita Friedericy, Annie Ilonzeh, Tania Saulnier, Marie Avgeropoulos, Luisa D'Oliveira, Christie Laing, Marielle Jaffe, Elisa King, Chrystal Tisiga, Alexis Knapp, Charlie Gallant, Chelan Simmons, Andrea Brooks, Natassia Malthe, Max Van Ville, Serinda Swan, Dimitri Lekkos, Ona Grauer, Stefanie von Pfetten, Conrad Coates, Erica Cerra, Dylan Neal, Luke Camilleri, Holly Hougham, Ina Geraldine, Raquel Riskin, Yusleidis Oquendo, Janine Edwards, Valerie Tian, Violet Columbus, Sarah Smyth, Merritt Patterson, Julie Luck, Andrea Day, John Stewart, Suzanne Ristic, Deejay Jackson, Matthew Garrick, Stan Carp, Suzanna Ristic, Richard Harmon, Maria Olsen, Robin Lemon, Doyle Devereux, Tom Pickett, VJ Delos-Reyes, Tim Aas, Keith Dallas, Spencer Atkinson, Maya Washington, Loyd Bateman, Victor Ayala, Zane Holtz, Eli Zagoudakis, Matt Reimer, Rob Hayter, Lloyd Bateman, Shawn Beaton, Jarod Joseph, Reilly Dolman, Paul Cummings, Julie Brar, Dejan Loyola, Damian Arman, Mario Casoria, Dorla Bell, Carolyn Adair (II), Jade Pawluk, G. Patrick Currie, Darian Arman, Mariela Zapata, David L. Smith
## 2 Catherine Keener, Amanda Peet, Oliver Platt, Rebecca Hall, Sarah Steele, Ann Morgan Guilbert, Kevin Corrigan, Lois Smith
## 3 Dudley Moore, Bo Derek, Julie Andrews, Robert Webber, Dee Wallace, Sam Jones III, Sam J. Jones, Brian Dennehy, Max Showalter, Rad Daly, Nedra Volz, James Noble, Virginia Kiser, John Hawker, Deborah Rush, Don Calfa, Walter George Alton, Annette Martin, John Hancock, Lorry Goldman, Arthur Rosenberg, Mari Gorman, Marcy Hanson, Julia Jennings, Senilo Tanney, Kitty DeCarlo, William Lucking, Owen Sullivan, Debbie White, Laurence Carr, Camila Ashland, Adrian Aron, Gail Bowman, Burke Byrnes, Michael Champion, Doug Sheehan, J. Victor Lopez, Gregory Chase, Jon Linton, Ellen Clark, Antonia Ellis, John Chappell, Lynn Farrell, Art Kassul, Denise Crosby, Jeanetta Arnette
## 4 Martin Balsam, John Fiedler, Lee J. Cobb, E.G. Marshall, Jack Klugman, Edward Binns, Jack Warden, Henry Fonda, Joseph Sweeney, Ed Begley Sr., George Voskovec, Robert Webber, Rudy Bond, James A. Kelly, Billy Nelson, John Savoca
## 5 James Mason, Kirk Douglas, Paul Lukas, Peter Lorre, Robert J. Wilke, Carleton Young, Bob Wilke, Ted de Corsia, Percy Helton, Ted Cooper, Eddie Marr, Fred Graham, Esmeralda, J.M. Kerrigan, Harry Harvey, Herb Vigran
## 6 Steven Strait, Camilla Belle, Cliff Curtis, Joel Virgel, Affif Ben Badra, Mo Zainal, Nathanael Baring, Marco Khan, Reece Ritchie, Joel Fry, Junior Oliphant, Louise Tu'u, David Dennis, Kristian Beazley, Mona Hammond, Jacob Renton, Grayson Hunt Urwin, Fahruq Valley-Omar, Boubacar Badaine, Joe Vaz, Charles Baloyi, Tim Barlow, Gabriel Malema, Mark Simmons, Hannah Westbury, Affif Ben Badra, Mo Zinal, Omar Sharif, Farouk Valley-Omar
## original_release_date streaming_release_date runtime production_company
## 1 2/12/2010 11/25/2015 119 20th Century Fox
## 2 4/30/2010 9/4/2012 90 Sony Pictures Classics
## 3 10/5/1979 7/24/2014 122 Waner Bros.
## 4 4/13/1957 1/13/2017 95 Criterion Collection
## 5 1/1/1954 6/10/2016 127 Disney
## 6 3/7/2008 6/22/2013 109 Warner Bros. Pictures
## tomatometer_status tomatometer_rating tomatometer_count audience_status
## 1 Rotten 49 149 Spilled
## 2 Certified-Fresh 87 142 Upright
## 3 Fresh 67 24 Spilled
## 4 Certified-Fresh 100 54 Upright
## 5 Fresh 89 27 Upright
## 6 Rotten 8 149 Spilled
## audience_rating audience_count tomatometer_top_critics_count
## 1 53 254421 43
## 2 64 11574 44
## 3 53 14684 2
## 4 97 105386 6
## 5 74 68918 5
## 6 37 411140 37
## tomatometer_fresh_critics_count tomatometer_rotten_critics_count
## 1 73 76
## 2 123 19
## 3 16 8
## 4 54 0
## 5 24 3
## 6 12 137
# Keep only the two columns needed for the sentiment analysis: the movie
# title and the critics' consensus text.
dataset <- reviews[, c("movie_title", "critics_consensus")]
head(dataset)
## movie_title
## 1 Percy Jackson & the Olympians: The Lightning Thief
## 2 Please Give
## 3 10
## 4 12 Angry Men (Twelve Angry Men)
## 5 20,000 Leagues Under The Sea
## 6 10,000 B.C.
## critics_consensus
## 1 Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a speedy plot, and plenty of fun with Greek mythology.
## 2 Nicole Holofcener's newest might seem slight in places, but its rendering of complex characters in a conflicted economic landscape is varied, natural, and touching all the same.
## 3 Blake Edwards' bawdy comedy may not score a perfect 10, but Dudley Moore's self-deprecating performance makes this midlife crisis persistently funny.
## 4 Sidney Lumet's feature debut is a superbly written, dramatically effective courtroom thriller that rightfully stands as a modern classic.
## 5 One of Disney's finest live-action adventures, 20,000 Leagues Under the Sea brings Jules Verne's classic sci-fi tale to vivid life, and features an awesome giant squid.
## 6 With attention strictly paid to style instead of substance, or historical accuracy, 10,000 B.C. is a visually impressive but narratively flimsy epic.
First, I’ll tokenize the critics’ consensus text and remove the standard stop words.
# Split the critics' consensus text into one word per row, then drop the
# standard stop words. The input column is referenced by name rather than
# by position, so the step stays correct if `dataset`'s columns are ever
# reordered, and it is clear which text is being tokenized. The join key is
# left implicit (the shared `word` column), so dplyr reports it.
token <- dataset %>%
  unnest_tokens(word, critics_consensus) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Word frequencies across all consensus texts, most frequent first.
token_count <- count(token, word, sort = TRUE)
head(token_count)
## word n
## 1 story 827
## 2 cast 709
## 3 performance 677
## 4 performances 676
## 5 director 641
## 6 movie 542
There are some words at the top here that don’t add much to our analysis, so I’ll customize my stop word list to improve the analysis.
# Film-review vocabulary that is frequent in the consensus text but is to
# be excluded from the sentiment analysis; tagged with a "custom" lexicon.
new_stop <- data.frame(
  word = c(
    "story", "cast", "performance", "performances", "director", "movie",
    "film", "life", "offers", "makes", "characters", "fans", "viewers",
    "genre", "direction", "writer", "script", "tale", "ultimately",
    "material", "effects", "character", "time", "subject", "features",
    "title", "plot", "comedy", "drama", "thriller", "action", "horror",
    "acted", "premise"
  ),
  lexicon = "custom"
)

# Combine the custom entries with the standard stop-word table.
my_stopwords <- rbind(new_stop, stop_words)

# Drop every token that appears in the combined stop-word list.
critic <- filter(token, !(word %in% my_stopwords$word))
# Word frequencies after the custom stop words are removed, most frequent
# first.
critic_count <- count(critic, word, sort = TRUE)
head(critic_count)
## word n
## 1 funny 348
## 2 strong 313
## 3 entertaining 293
## 4 humor 286
## 5 classic 265
## 6 talented 242
That’s a pretty good collection of sentiment. Let’s introduce a lexicon to bucket the sentiments as either positive or negative.
# Preview the Bing lexicon: one row per word with its sentiment label.
get_sentiments(lexicon = "bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
# Tag the filtered critic words with the Bing lexicon and compute how the
# matches split between the two labels. Note that `total` holds each
# label's share of all matched words (n / sum(n)), not a running total.
critic_bing_pct <- critic %>%
  inner_join(get_sentiments("bing")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment) %>%
  mutate(total = n / sum(n))
## Joining, by = "word"
critic_bing_pct
## sentiment n total
## 1 negative 11547 0.4280154
## 2 positive 15431 0.5719846
This is especially surprising: about 57% of the matched words are positive and only about 43% are negative, even though the critics’ job is criticism, after all. Let’s take a look at this graphically.
# Per-word counts of Bing-tagged critic vocabulary, most frequent first.
# This reuses the name `bing_word_counts`, replacing the Austen table
# built earlier in the script.
bing_word_counts <- critic %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Plot the ten highest-count words per sentiment label as horizontal bars,
# ordered by count, with one panel per label.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
Pretty strange that bing buckets “funny” as a negative, and it’s especially a shame that it contributes disproportionately to the negative sentiment, but our analysis at least highlights a problem with using bing for movie review purposes!