knitr::opts_chunk$set(echo = TRUE)
library(tidytext)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janeaustenr)
library(stringr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE) %>%
head()
## Joining, by = "word"
## # A tibble: 6 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice%>%
head()
## # A tibble: 6 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): miss could not be fit on page.
## It will not be plotted.
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
PandP_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
top_n(1) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
#Reads in the dataframe
movieraw <- "https://raw.githubusercontent.com//SudharshanShanmugasundaram//Chatbot//master//data//cornell%20movie-dialogs%20corpus//movie_lines.txt"
movielinesall <- as.data.frame(readLines(movieraw))
movielinesall <- as.data.frame(do.call(rbind, strsplit(as.character(movielinesall$`readLines(movieraw)`)," +++$+++ ",fixed = TRUE)), stringsAsFactors = FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 539)
#Sets column names
colnames(movielinesall) <- c("LineID", "CharacterID", "MovieID", "Character_Name", "Line_Text")
#Removing "L" from the line number
movielinesall$LineID <- as.numeric(gsub("L","",movielinesall$LineID))
#Changing the data types where nessicary
movielinesall$MovieID <- as.factor(movielinesall$MovieID)
movielinesall$CharacterID <- as.factor(movielinesall$CharacterID)
movielinesall$LineID <- as.numeric(movielinesall$LineID)
#Trimming it down a little (optional)
movielines <- movielinesall
##movielines <- subset.data.frame(movielinesall, MovieID == c("m284", "m5", "m264", "m595"), select = c(LineID:Line_Text))
#Tidying up the dataset to include words in each line
tidylines <- movielines %>%
unnest_tokens(word, Line_Text)
rownames(tidylines) <- 1:nrow(tidylines)
#Looking for the count of the joy words in the whole dataset
tidylines %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE) %>%
head()
## Joining, by = "word"
## # A tibble: 6 x 2
## word n
## <chr> <int>
## 1 good 7489
## 2 love 3221
## 3 money 2733
## 4 god 2441
## 5 kind 2078
## 6 mother 1472
#Changed this to look at sentiment by line
moviesentiment <- tidylines %>%
inner_join(get_sentiments("bing")) %>%
count(MovieID, index = LineID %/% 1, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
#Going to take subset of the movies, for the sake of brevity
moviesentimentsub <- subset(moviesentiment, MovieID == c("m100", "m103", "m207", "m519"), select = c(MovieID:sentiment))
## Warning in `==.default`(MovieID, c("m100", "m103", "m207", "m519")): longer
## object length is not a multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
ggplot(moviesentimentsub, aes(index, sentiment, fill = MovieID)) +
geom_col(show.legend = FALSE) +
facet_wrap(~MovieID, ncol = 2, scales = "free_x")
### We can see above how the sentiment changes across dialogue lines for 4 movies that I selected at random. The scale of the sentiment isn’t as large as the textbook example as the size of each chunk of text here corrisponds to one line of dialogue, instead of a bunch of lines like in the books.
#We've changed some of the code from the example, but it's mostly just replacing the variables
heathers <- tidylines %>%
filter(MovieID == "m383")
afinn <- heathers %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = LineID %/% 1) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(heathers %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
heathers %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = LineID %/% 1, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
#We've mostly changed variables again
bing_word_counts <- tidylines %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts %>%
head()
## # A tibble: 6 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 like positive 15041
## 2 right positive 10060
## 3 well positive 9958
## 4 good positive 7489
## 5 sorry negative 3290
## 6 love positive 3221
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
#The stop-words for movies are probably different than for Jane-Austen. Maybe more like this
custom_stop_words <- bind_rows(tibble(word = c("like", "right", "well", "work", "yeah"),
lexicon = c("custom")),
stop_words)
custom_stop_words %>%
head()
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 like custom
## 2 right custom
## 3 well custom
## 4 work custom
## 5 yeah custom
## 6 a SMART
#Pretty much no changes here from the example code aside from some variables
tidylines %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidylines %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
#This code is based off of the example, but there are significant changes
#We'll seperate out the sentances of dialog, again for the movie 'Heathers'
#It's easier in this case to just subset our main dataset 'movielines' than it is to pull a script and break it down
heatherslines <- as.data.frame(subset(movielines, MovieID == "m383", ))
#Changing this to group by character instead of chapter
heatherschars <- heatherslines %>%
group_by(CharacterID) %>%
mutate(Character_Dialogue = paste0(Line_Text, collapse = "")) %>%
distinct(heatherschars,Character_Dialogue)
## Warning: Trying to compute distinct() for variables not found in the data:
## - `heatherschars`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - Character_Dialogue
#No changes here
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
#Changed this to reflect characters' words instead of chapers in books
heatherswordcounts <- heathers %>%
group_by(CharacterID, Character_Name) %>%
summarize(words = n())
heathers %>%
semi_join(bingnegative) %>%
group_by(CharacterID) %>%
summarize(negativewords = n()) %>%
left_join(heatherswordcounts, by = "CharacterID") %>%
mutate(ratio = negativewords/words) %>%
arrange(desc(ratio)) %>%
ungroup()
## Joining, by = "word"
## # A tibble: 15 x 5
## CharacterID negativewords Character_Name words ratio
## <fct> <int> <chr> <int> <dbl>
## 1 u5796 8 KURT 90 0.0889
## 2 u5790 6 DAD 69 0.0870
## 3 u5801 8 WHITNEY JAMES 140 0.0571
## 4 u5797 3 MOM 55 0.0545
## 5 u5793 17 HEATHER DUKE 323 0.0526
## 6 u5791 3 EARL 61 0.0492
## 7 u5800 94 VERONICA 2014 0.0467
## 8 u5787 4 BETTY 88 0.0455
## 9 u5795 59 J.D. 1299 0.0454
## 10 u5794 7 HEATHER MCNAMARA 161 0.0435
## 11 u5799 4 RAM 104 0.0385
## 12 u5789 1 COURTNEY 28 0.0357
## 13 u5792 18 HEATHER CHANDLER 544 0.0331
## 14 u5798 2 PAULINE 71 0.0282
## 15 u5788 1 BRAD 81 0.0123
#Nearly the same code from the last chunk
loughrannegative <- get_sentiments("loughran") %>%
filter(sentiment == "negative")
heatherswordcounts <- heathers %>%
group_by(CharacterID, Character_Name) %>%
summarize(words = n())
heathers %>%
semi_join(loughrannegative) %>%
group_by(CharacterID) %>%
summarize(negativewords = n()) %>%
left_join(heatherswordcounts, by = "CharacterID") %>%
mutate(ratio = negativewords/words) %>%
arrange(desc(ratio)) %>%
ungroup()
## Joining, by = "word"
## # A tibble: 12 x 5
## CharacterID negativewords Character_Name words ratio
## <fct> <int> <chr> <int> <dbl>
## 1 u5801 6 WHITNEY JAMES 140 0.0429
## 2 u5789 1 COURTNEY 28 0.0357
## 3 u5790 2 DAD 69 0.0290
## 4 u5796 2 KURT 90 0.0222
## 5 u5797 1 MOM 55 0.0182
## 6 u5791 1 EARL 61 0.0164
## 7 u5795 20 J.D. 1299 0.0154
## 8 u5794 2 HEATHER MCNAMARA 161 0.0124
## 9 u5793 4 HEATHER DUKE 323 0.0124
## 10 u5787 1 BETTY 88 0.0114
## 11 u5800 21 VERONICA 2014 0.0104
## 12 u5792 4 HEATHER CHANDLER 544 0.00735
litigiousness <- tidylines %>%
inner_join(get_sentiments("loughran")) %>%
count(MovieID, index = LineID %/% 1, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
select(MovieID, litigious) %>%
group_by(MovieID) %>%
summarise(sum = sum(litigious), n = n()) %>%
mutate(Litigousness = sum / n) %>%
arrange(desc(Litigousness)) %>%
ungroup()
## Joining, by = "word"
ggplot(head(litigiousness, 5), aes(y = reorder(MovieID, -Litigousness), x = Litigousness,fill = Litigousness)) +
geom_col(show.legend = FALSE) +
scale_y_discrete(labels = c("Verdict", "The Verdict", "True Believer", "The Usual Suspects", "Romeo and Juliet")) +
labs(x = "Litigousness",
y = "Movie Title") +
coord_flip()