Datasets are downloaded from:
https://assets.datacamp.com/production/repositories/591/datasets/0c20fb94d054c9b86c264efd7e3f4735f26c3587/all_book_polarity.rds
https://assets.datacamp.com/production/repositories/591/datasets/7041f7019b83dae070bc2ded9213f8f0c611fbe3/all_books.rds
https://assets.datacamp.com/production/repositories/591/datasets/5e012008bbcfb1897be17accadf5718c9b036bb9/all_tdm.rds
https://assets.datacamp.com/production/repositories/591/datasets/8514c01e859182c749d07276a3af50b3c192eed1/bos_pol.rds
https://assets.datacamp.com/production/repositories/591/datasets/87fb7cd77a6bc8b22ba2aa4f150132b66b57483c/bos_reviews.rds
http://www.gutenberg.org/cache/epub/14417/pg14417.txt
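A minimal sketch for fetching one of these files into a local Datasets/ folder (the folder name matches the paths used throughout these notes):
# Download one course dataset and read it back in (sketch)
dir.create("Datasets", showWarnings = FALSE)
url <- "https://assets.datacamp.com/production/repositories/591/datasets/7041f7019b83dae070bc2ded9213f8f0c611fbe3/all_books.rds"
download.file(url, destfile = "Datasets/all_books.rds", mode = "wb")
all_books <- readRDS("Datasets/all_books.rds")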
library(qdap)
library(dplyr)
library(tidyverse)
library(tm)
library(magrittr)
library(lexicon)
library(metricsgraphics)
library(broom)
library(tidytext)
library(textdata)
library(ggthemes)
library(wordcloud)
library(radarchart)
library(treemap)
Sentiment analysis is the process of extracting an author’s emotional intent from text.
Bag-of-words methods simply organize text and perform analysis without regard to the author's emotions.
This course still uses similar tools (TDM, DTM, vector-based analysis, etc.) but also introduces the tidy tibble format.
In this example you will use the magrittr package's dollar pipe operator %$%. The dollar pipe forwards the data frame into polarity(), where you declare the text column (and, optionally, a grouping variable) without quotes.
# library(qdap) is loaded
# library(magrittr) is loaded
text_df <- read.csv("Datasets/polarity_text_df.csv", stringsAsFactors = FALSE)
text_df
## person
## 1 Nick
## 2 Jonathan
## 3 Martijn
## 4 Nicole
## 5 Nick
## 6 Jonathan
## 7 Martijn
## 8 Nicole
## text
## 1 DataCamp courses are the best
## 2 I like talking to students
## 3 Other online data science curricula are boring.
## 4 What is for lunch?
## 5 DataCamp has lots of great content!
## 6 Students are passionate and are excited to learn
## 7 Other data science curriculum is hard to learn and difficult to understand
## 8 I think the food here is good.
# Calc overall polarity score
text_df %$% polarity(text)
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 8 54 0.179 0.452 0.396
The equation the algorithm uses to assign a polarity value to each sentence first utilizes the sentiment dictionary (Hu & Liu, 2004) to tag polarized words.
# Calc polarity score by person
(datacamp_conversation <- text_df %$% polarity(text, person))
## person total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 Jonathan 2 13 0.577 0.184 3.141
## 2 Martijn 2 19 -0.478 0.141 -3.388
## 3 Nick 2 11 0.428 0.028 15.524
## 4 Nicole 2 11 0.189 0.267 0.707
counts(datacamp_conversation)
## person wc polarity pos.words neg.words text.var
## 1 Nick 5 0.447 best - DataCamp courses are the best
## 2 Jonathan 5 0.447 like - I like talking to students
## 3 Martijn 7 -0.378 - boring Other online data science curricula are boring.
## 4 Nicole 4 0.000 - - What is for lunch?
## 5 Nick 6 0.408 great - DataCamp has lots of great content!
## 6 Jonathan 8 0.707 passionate, excited - Students are passionate and are excited to learn
## 7 Martijn 12 -0.577 - hard, difficult Other data science curriculum is hard to learn and difficult to understand
## 8 Nicole 7 0.378 good - I think the food here is good.
scores(datacamp_conversation)
## person total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 Jonathan 2 13 0.577 0.184 3.141
## 2 Martijn 2 19 -0.478 0.141 -3.388
## 3 Nick 2 11 0.428 0.028 15.524
## 4 Nicole 2 11 0.189 0.267 0.707
# Plot the conversation polarity
plot(datacamp_conversation)
Use VectorSource(), VCorpus(), removePunctuation() and stripWhitespace() from tm, and replace_abbreviation() from qdap. Define a custom clean_corpus() function and the tm_define text:
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "coffee"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)
}
tm_define <- c("Text mining is the process of distilling actionable insights from text.",
"Sentiment analysis represents the set of tools to extract an author's feelings towards a subject.")
Make the corpus and clean it:
# Create a VectorSource
tm_vector <- VectorSource(tm_define)
# Apply VCorpus
tm_corpus <- VCorpus(tm_vector)
# Examine the first document's contents
content(tm_corpus[[1]])
## [1] "Text mining is the process of distilling actionable insights from text."
# Clean the text
tm_clean <- clean_corpus(tm_corpus)
# Reexamine the contents of the first doc
content(tm_clean[[1]])
## [1] "text mining process distilling actionable insights text"
Prepare the data to match the one used in the course (coffee tweets from the text mining course):
coffee <- read.csv("Datasets/coffee.csv", stringsAsFactors = FALSE)
coffee <- coffee$text
coffee <- VCorpus(VectorSource(coffee))
clean_text <- clean_corpus(coffee)
Create DTM. Note the frequency value of the word “working.”
# Create tf_dtm
tf_dtm <- DocumentTermMatrix(clean_text)
# Create tf_dtm_m
tf_dtm_m <- as.matrix(tf_dtm)
# Dimensions of DTM matrix
dim(tf_dtm_m)
## [1] 1000 3098
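To follow up on the note about "working" above, one quick check of its total frequency (a sketch; the column only exists if the term survived cleaning):
# How often does "working" appear across all tweets?
if ("working" %in% colnames(tf_dtm_m)) sum(tf_dtm_m[, "working"])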
A subjectivity lexicon is a predefined list of words associated with emotional context.
qdap's polarity() uses the lexicon from hash_sentiment_huliu; tidytext provides a sentiments tibble; the lexicon package lists its bundled lexicons via available_data() (a quick way to browse these is sketched after the list below).
Why do lexicons work despite a quite limited number of words (typically < 1000)?
* Zipf's law: the frequency of any word is inversely proportional to its rank in the frequency table. Thus the most frequent word will occur approximately twice as often as the second most frequent word, three times as often as the third most frequent word, etc.
* Principle of Least Effort: if there are several ways of achieving the same goal, people will choose the least demanding course of action.
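A quick way to browse what is available (a sketch; available_data() and hash_sentiment_huliu both come from the lexicon package loaded above):
# List datasets bundled with the lexicon package and peek at the Hu & Liu lexicon
head(lexicon::available_data())
head(lexicon::hash_sentiment_huliu)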
Although Zipf observed a steep and predictable decline in word usage you may not buy into Zipf’s law. You may be thinking “I know plenty of words, and have a distinctive vocabulary”. That may be the case, but the same can’t be said for most people! To prove it, let’s construct a visual from 3 million tweets mentioning “#sb”.
# library(metricsgraphics) is loaded
sb_words <- dget("Datasets/sb_words")
head(sb_words)
## word freq rank
## 1 sb 1984423 1
## 2 rt 1700564 2
## 3 the 1101899 3
## 4 to 588803 4
## 5 a 428598 5
## 6 for 388390 6
# Create a new column expectations by dividing the largest word frequency, freq[1], by the rank column
sb_words$expectations <- sb_words %$%
{freq[1] / rank}
Make plot:
# Create metrics plot
sb_plot <- mjs_plot(sb_words, x = rank, y = freq, show_rollover_text = FALSE)
# Add 1st line
sb_plot <- mjs_line(sb_plot)
# Add 2nd line
sb_plot <- mjs_add_line(sb_plot, expectations)
# Add legend
sb_plot <- mjs_add_legend(sb_plot, legend = c("Frequency", "Expectation"))
# Display plot
sb_plot
The function scans the text to identify words in the lexicon. It then creates a cluster around an identified subjectivity word. Within the cluster valence shifters adjust the score. Valence shifters are words that amplify or negate the emotional intent of the subjectivity word. For example, “well known” is positive while “not well known” is negative. Here “not” is a negating term and reverses the emotional intent of “well known.” In contrast, “very well known” employs an amplifier increasing the positive intent.
The polarity() function then calculates a score using subjectivity terms, valence shifters and the total number of words in the passage. This exercise demonstrates a simple polarity calculation. In the next video we look under the hood of polarity() for more detail.
# Example statement
positive <- "DataCamp courses are good for learning"
# Calculate polarity of statement
(pos_score <-polarity(positive))
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 1 6 0.408 NA NA
# Get counts
(pos_counts <- counts(pos_score))
## all wc polarity pos.words neg.words text.var
## 1 all 6 0.408 good - DataCamp courses are good for learning
# Number of positive words
n_good <- length(pos_counts$pos.words[[1]])
# Total number of words
n_words <- pos_counts$wc
# Verify polarity score
n_good / sqrt(n_words)
## [1] 0.4082483
Context cluster glossary:
An amplifying word adds 0.8 to a positive word in polarity(), so the positive score becomes 1.8. For negative words, 0.8 is subtracted, so the total becomes -1.8. The score is then divided by the square root of the total number of words.
Consider the following example from Frank Sinatra:
“It was a very good year”
A negating word such as “not” will inverse the subjectivity score. Consider the following example from Bobby McFerrin:
“Don’t worry Be Happy”
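As a rough sketch of how these shifters change the score (exact values depend on the lexicon and the default amplifier/negator weights):
# Amplifier: "very" boosts "good"; roughly (1 + 0.8) / sqrt(6) with default weights
counts(polarity("it was a very good year"))
# Negator: "don't" flips the negative word "worry", while "happy" stays positive
counts(polarity("don't worry be happy"))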
# Negators
negation.words
## [1] "ain't" "aren't" "can't" "couldn't" "didn't" "doesn't"
## [7] "don't" "hasn't" "isn't" "mightn't" "mustn't" "neither"
## [13] "never" "no" "nobody" "nor" "not" "shan't"
## [19] "shouldn't" "wasn't" "weren't" "won't" "wouldn't"
# Amplifiers
amplification.words
## [1] "acute" "acutely" "certain" "certainly"
## [5] "colossal" "colossally" "deep" "deeply"
## [9] "definite" "definitely" "enormous" "enormously"
## [13] "extreme" "extremely" "great" "greatly"
## [17] "heavily" "heavy" "high" "highly"
## [21] "huge" "hugely" "immense" "immensely"
## [25] "incalculable" "incalculably" "massive" "massively"
## [29] "more" "particular" "particularly" "purpose"
## [33] "purposely" "quite" "real" "really"
## [37] "serious" "seriously" "severe" "severely"
## [41] "significant" "significantly" "sure" "surely"
## [45] "true" "truly" "vast" "vastly"
## [49] "very"
# De-amplifiers
deamplification.words
## [1] "barely" "faintly" "few" "hardly" "little"
## [6] "only" "rarely" "seldom" "slightly" "sparsely"
## [11] "sporadically" "very few" "very little"
(text <- dget("Datasets/text"))
## speaker
## 1 beyonce
## 2 jay_z
## words
## 1 I know I dont understand Just how your love can do what no one else can
## 2 They cant figure him out they like hey, is he insane
# Complete the polarity parameters
polarity(
text.var = text$words,
grouping.var = text$speaker,
polarity.frame = key.pol,
negators = negation.words,
amplifiers = amplification.words,
deamplifiers = deamplification.words
)
## speaker total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 beyonce 1 16 0.25 NA NA
## 2 jay_z 1 11 0.00 NA NA
If a word is missing from the key.pol lexicon, you need to add it. The code below uses sentiment_frame() to construct a new lexicon. Within the code, sentiment_frame() accepts the original positive word vector, positive.words, while negative.words is concatenated with "smh" and "kappa", both considered negative slang. Although you can declare the positive and negative weights, the defaults are 1 and -1, so they are not included below.
custom_pol <- sentiment_frame(positive.words, c(negative.words, "smh", "kappa"))
Full example:
stressed_out <- dget("Datasets/stressed_out")
head(stressed_out)
## [1] "I wish I found some better sounds no ones ever heard\nI wish I had a better voice that sang some better words\nI wish I found some chords in an order that is new\nI wish I didnt have to rhyme every time I sang\nI was told when I get older all my fears would shrink\nBut now Im insecure and I care what people think\nMy names Blurryface and I care what you think\nMy names Blurryface and I care what you think\nWish we could turn back time, to the good old days\nWhen our momma sang us to sleep but now were stressed out\nWish we could turn back time to the good old days\nWhen our momma sang us to sleep but now were stressed out\nWere stressed out\nSometimes a certain smell will take me back to when I was young\nHow come Im never able to identify where its coming from\nId make a candle out of it if I ever found it\nTry to sell it never sell out of it Id probably only sell one\nItd be to my brother, cause we have the same nose\nSame clothes homegrown a stones throw from a creek we used to roam\nBut it would remind us of when nothing really mattered\nOut of student loans and tree-house homes we all would take the latter\nMy names Blurryface and I care what you think\nMy names Blurryface and I care what you think\nWish we could turn back time, to the good old days\nWhen our momma sang us to sleep but now were stressed out\nWish we could turn back time, to the good old days\nWhen our momma sang us to sleep but now were stressed out\nWe used to play pretend, give each other different names\nWe would build a rocket ship and then wed fly it far away\nUsed to dream of outer space but now theyre laughing at our face #\nSaying, Wake up you need to make money\nYeah\nWe used to play pretend give each other different names\nWe would build a rocket ship and then wed fly it far away\nUsed to dream of outer space but now theyre laughing at our face\nSaying, Wake up, you need to make money\nYeah\nWish we could turn back time, to the good old days\nWhen our momma sang us to sleep but now were stressed out\nWish we could turn back time, to the good old days\nWhen our momma sang us to sleep but now were stressed out\nUsed to play pretend, used to play pretend bunny\nWe used to play pretend wake up, you need the money\nUsed to play pretend used to play pretend bunny\nWe used to play pretend, wake up, you need the money\nWe used to play pretend give each other different names\nWe would build a rocket ship and then wed fly it far away\nUsed to dream of outer space but now theyre laughing at our face\nSaying, Wake up, you need to make money\nYeah"
# Basic lexicon score
polarity(stressed_out)
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 1 518 -0.255 NA NA
# Check the subjectivity lexicon
key.pol[grep("stress", x)]
## x y
## 1: distress -1
## 2: distressed -1
## 3: distressing -1
## 4: distressingly -1
## 5: mistress -1
## 6: stress -1
## 7: stresses -1
## 8: stressful -1
## 9: stressfully -1
# New lexicon
custom_pol <- sentiment_frame(positive.words, c(negative.words, "stressed", "turn back"))
# Compare new score
polarity(stressed_out, polarity.frame = custom_pol)
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 1 518 -0.826 NA NA
Plutchik's wheel of emotion uses 8 primary emotions, assuming other emotions are derived from them.
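A quick way to see those eight emotions as encoded in the NRC lexicon (a sketch; get_sentiments() and the nrc lexicon are covered later in these notes, and textdata may prompt you to download the lexicon the first time):
# Plutchik's 8 primary emotions as they appear in the NRC lexicon
get_sentiments("nrc") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  distinct(sentiment)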
Kanjoya is a company specializing in emotion analysis. It uses a more complex framework, based on tracking users' experienced emotions.
To change a DTM to a tidy format, use tidy(); the generic lives in the broom package, while the method for document-term matrices comes from tidytext.
library(broom)
tidy_format <- tidy(Document_Term_Matrix)
# library(tidytext) is loaded
ag_dtm <- dget("Datasets/agamemnon")
ag_dtm_m <- as.matrix(ag_dtm)
# Examine line 2206 and columns 245:250
ag_dtm_m[2206, 245:250]
## bleed bleeds blent bless blessãd blessing
## 0 0 0 1 0 0
# Tidy up the DTM
ag_tidy <- tidy(ag_dtm)
# Examine tidy with a word you saw
ag_tidy[831:835, ]
## # A tibble: 5 x 3
## document term count
## <chr> <chr> <dbl>
## 1 234 bleeds 1
## 2 234 sleepeth 1
## 3 235 comes 1
## 4 235 will 1
## 5 235 wisdom 1
textdata lexicons:
afinn: score -5 / +5
bing: positive / negative
loughran: positive / negative
nrc: 8 emotions + positive / negative
# library(textdata) is loaded
# AFINN
afinn_lex <- get_sentiments("afinn")
afinn_lex %>% count(value)
## # A tibble: 11 x 2
## value n
## <dbl> <int>
## 1 -5 16
## 2 -4 43
## 3 -3 264
## 4 -2 966
## 5 -1 309
## 6 0 1
## 7 1 208
## 8 2 448
## 9 3 172
## 10 4 45
## 11 5 5
# bing
bing_lex <- get_sentiments("bing")
bing_lex %>% count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# loughran
loughran_lex <- get_sentiments("loughran")
loughran_lex %>% count(sentiment)
## # A tibble: 6 x 2
## sentiment n
## <chr> <int>
## 1 constraining 184
## 2 litigious 904
## 3 negative 2355
## 4 positive 354
## 5 superfluous 56
## 6 uncertainty 297
# nrc
nrc_lex <- get_sentiments("nrc")
nrc_lex %>% count(sentiment)
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231
Examples of use:
inner_join: text_table, subjectivity_lexicon (sentiment of words that are present in the lexicon)
anti_join: text_table, stopwords_table (exclude stopwords)
# first 100 lines from Agamemnon
ag_txt <- dget("Datasets/ag_txt")
# ag_tidy already loaded (above)
# Qdap polarity
polarity(ag_txt)
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 100 15157 -2.783 NA NA
# Get Bing lexicon
bing <- get_sentiments("bing")
# Join text to lexicon
ag_bing_words <- inner_join(ag_tidy, bing, by = c("term" = "word"))
# Examine
ag_bing_words
## # A tibble: 1,431 x 4
## document term count sentiment
## <chr> <chr> <dbl> <chr>
## 1 10 waste 1 negative
## 2 11 respite 1 positive
## 3 13 well 1 positive
## 4 14 lonely 1 negative
## 5 16 great 1 positive
## 6 16 heavenly 1 positive
## 7 22 dark 1 negative
## 8 23 fear 1 negative
## 9 24 warning 1 negative
## 10 25 well 1 positive
## # ... with 1,421 more rows
# Get counts by sentiment
ag_bing_words %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 906
## 2 positive 525
Change in polarity:
# From previous step
ag_polarity <- ag_tidy %>%
inner_join(bing, by = c("term" = "word")) %>%
mutate(index = as.numeric(document)) %>%
count(sentiment, index) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
# Plot polarity vs. index
ggplot(ag_polarity, aes(index, polarity)) +
# Add a smooth trend curve
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
The AFINN lexicon has numeric values from -5 to +5, not just positive or negative. Unlike the Bing lexicon, which has a sentiment column, the AFINN lexicon's sentiment score column is called value.
# prepare data: Huck Finn
x <- readRDS("Datasets/all_books.rds")
huck <- x %>% filter(book == "huck_finn") %>%
rename(line = document) %>%
mutate(line = as.numeric(line)) %>%
select(term, count, line)
huck_afinn <- huck %>%
# Inner Join to AFINN lexicon
inner_join(afinn_lex, by = c("term" = "word")) %>%
# Count by value and line (e.g. n = 3 x -2 and n = 1 x 3 in line 5430)
count(value, line)
huck_afinn_agg <- huck_afinn %>%
# Group by line
group_by(line) %>%
# Sum values times n (by line)
summarise(total_value = sum(value * n))
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot total_value vs. line
ggplot(huck_afinn_agg, aes(line, total_value)) +
# Add a smooth trend curve
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
huck_plutchik <- huck %>%
# Join to nrc lexicon by term = word
inner_join(nrc_lex, by = c("term" = "word")) %>%
# Only consider Plutchik sentiments
filter(!sentiment %in% c("positive", "negative")) %>%
# Group by sentiment
group_by(sentiment) %>%
# Get total count by sentiment
summarize(total_count = sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
huck_plutchik
## # A tibble: 8 x 2
## sentiment total_count
## <chr> <dbl>
## 1 anger 1137
## 2 anticipation 2242
## 3 disgust 840
## 4 fear 1352
## 5 joy 1744
## 6 sadness 1326
## 7 surprise 1169
## 8 trust 2229
Good visuals:
moby <- x %>%
filter(book == "moby_dick") %>%
select(term, document, count) %>%
rename(index = document)
moby_polarity <- moby %>%
# Inner join to the lexicon
inner_join(bing_lex, by = c("term" = "word")) %>%
# Count by sentiment, index
count(sentiment, index) %>%
# Spread sentiments
spread(sentiment, n, fill = 0) %>%
mutate(
# Add polarity field
polarity = positive - negative,
# Add line number field
line_number = row_number()
)
ggplot(moby_polarity, aes(line_number, polarity)) +
# Add a smooth trend curve
geom_smooth() +
# Add a horizontal line at y = 0
geom_hline(yintercept = 0, color = "red") +
# Add a plot title
ggtitle("Moby Dick Chronological Polarity") +
theme_gdocs()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Use reorder(term, polarity) to change the order of factor levels based upon another scoring variable. In this case, you will reorder the factor variable term by the scoring variable polarity.
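A tiny illustration of reorder() on toy data before applying it to Moby Dick (the data frame below is made up):
# reorder() sets factor levels by a second, numeric variable
toy <- data.frame(term = c("good", "bad", "great"), polarity = c(10, -5, 25))
levels(reorder(toy$term, toy$polarity))  # levels: "bad" "good" "great"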
moby_tidy_sentiment <- moby %>%
# Inner join to bing lexicon by term = word
inner_join(bing_lex, by = c("term" = "word")) %>%
# Count by term and sentiment, weighted by count
count(term, sentiment, wt = count) %>%
# Spread sentiment, using n as values
spread(sentiment, n, fill = 0) %>%
# Mutate to add a polarity column
mutate(polarity = positive - negative)
# Review
moby_tidy_sentiment
## # A tibble: 2,362 x 4
## term negative positive polarity
## <chr> <dbl> <dbl> <dbl>
## 1 abominable 3 0 -3
## 2 abominate 1 0 -1
## 3 abomination 1 0 -1
## 4 abound 0 3 3
## 5 abruptly 2 0 -2
## 6 absence 5 0 -5
## 7 absurd 3 0 -3
## 8 absurdly 1 0 -1
## 9 abundance 0 3 3
## 10 abundant 0 2 2
## # ... with 2,352 more rows
moby_tidy_pol <- moby_tidy_sentiment %>%
# Filter for absolute polarity at least 50
filter(abs(polarity) >= 50) %>%
# Add positive/negative status
mutate(
pos_or_neg = ifelse(polarity > 0, "positive", "negative")
)
# Review
moby_tidy_pol
## # A tibble: 21 x 5
## term negative positive polarity pos_or_neg
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 best 0 64 64 positive
## 2 better 0 62 62 positive
## 3 dark 55 0 -55 negative
## 4 dead 90 0 -90 negative
## 5 death 75 0 -75 negative
## 6 dick 77 0 -77 negative
## 7 enough 0 74 74 positive
## 8 fine 0 57 57 positive
## 9 good 0 195 195 positive
## 10 grand 0 52 52 positive
## # ... with 11 more rows
# Plot polarity vs. (term reordered by polarity), filled by pos_or_neg
ggplot(moby_tidy_pol,
aes(reorder(term, polarity), polarity, fill = pos_or_neg)) +
geom_col() +
ggtitle("Moby Dick: Sentiment Word Frequency") +
theme_gdocs() +
# Rotate text and vertically justify
theme(axis.text.x = element_text(angle = 90, vjust = -0.1))
vec <- c("A", "B", "C")
grep("C", vec) # returns index
## [1] 3
grepl("C", vec) # returns logical
## [1] FALSE FALSE TRUE
!grepl("C", vec) # returns inverted logical
## [1] TRUE TRUE FALSE
grepl("B|C", vec) # OR operator, logical outcome
## [1] FALSE TRUE TRUE
# oz_pol <- polarity(oz, grouping.var = NULL) # the raw source text is not provided by the course, so load the precomputed result instead
oz_pol <- dget("Datasets/oz_pol")
oz_df <- oz_pol$all %>%
# Select text.var as text and polarity
select(text = text.var, polarity = polarity)
head(oz_df)
## text polarity
## 1 The Wonderful Wizard of Oz 0.4472136
## 2 NaN
## 3 NaN
## 4 by 0.0000000
## 5 NaN
## 6 L. Frank Baum 0.0000000
# function to divide the corpus by polarity score
pol_subsections <- function(df) {
x.pos <- subset(df$text, df$polarity > 0)
x.neg <- subset(df$text, df$polarity < 0)
x.pos <- paste(x.pos, collapse = " ")
x.neg <- paste(x.neg, collapse = " ")
all.terms <- c(x.pos, x.neg)
return(all.terms)
}
all_terms <- pol_subsections(oz_df)
all_corpus <- all_terms %>%
# Source from a vector
VectorSource() %>%
# Make a volatile corpus
VCorpus()
all_tdm <- TermDocumentMatrix(
# Create TDM from corpus
all_corpus,
control = list(
# Yes, remove the punctuation
removePunctuation = TRUE,
# Use English stopwords
stopwords = stopwords(kind = "en")
)
) %>%
# Convert to matrix
as.matrix() %>%
# Set column names
set_colnames(c("positive", "negative"))
comparison.cloud(
# Create plot from the all_tdm matrix
all_tdm,
# Limit to 50 words
max.words = 50,
# Use darkgreen and darkred colors
colors = c("darkgreen", "darkred")
)
## Emotional introspection
Subset the text by each of the 8 emotions in Plutchik's emotional wheel to construct a visual:
moby_tidy <- moby %>%
# Inner join to nrc lexicon
inner_join(nrc_lex, by = c("term" = "word")) %>%
# Drop positive or negative
filter(!grepl("positive|negative", sentiment)) %>%
# Count by sentiment and term
count(sentiment, term) %>%
# Spread sentiment, using n for values
spread(sentiment, n, fill = 0) %>%
# Convert to data.frame, making term the row names
data.frame(row.names = "term")
# Examine
head(moby_tidy)
## anger anticipation disgust fear joy sadness surprise trust
## abandon 0 0 0 3 0 3 0 0
## abandoned 7 0 0 7 0 7 0 0
## abandonment 2 0 0 2 0 2 2 0
## abhorrent 1 0 1 1 0 0 0 0
## abominable 0 0 3 3 0 0 0 0
## abomination 1 0 1 1 0 0 0 0
comparison.cloud(moby_tidy, max.words = 50, title.size = 1.5)
Another way to slice your text is to understand how much of a document is made up of positive or negative words. For example, a restaurant review may have some positive aspects, such as "the food was good", but then continue to add "the restaurant was dirty, the staff was rude and parking was awful." As a result, you may want to understand how much of a document is dedicated to positive vs. negative language. In this example it would have a higher negative percentage compared to positive.
One method for doing so is to count() the positive and negative words then divide by the number of subjectivity words identified. In the restaurant review example, “good” would count as 1 positive and “dirty,” “rude,” and “awful” count as 3 negative terms. A simple calculation would lead you to believe the restaurant review is 25% positive and 75% negative since there were 4 subjectivity terms.
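The restaurant arithmetic above, as a toy sketch in base R (the review_terms vector is made up):
# 1 positive and 3 negative subjectivity terms -> 25% positive, 75% negative
review_terms <- c("positive", "negative", "negative", "negative")
prop.table(table(review_terms)) * 100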
all_books <- readRDS("Datasets/all_books.rds")
# Review tail of all_books
tail(all_books)
## # A tibble: 6 x 5
## term document count author book
## <chr> <chr> <dbl> <chr> <chr>
## 1 ebooks 19117 1 twain innocents_abroad
## 2 email 19117 1 twain innocents_abroad
## 3 hear 19117 1 twain innocents_abroad
## 4 new 19117 1 twain innocents_abroad
## 5 newsletter 19117 1 twain innocents_abroad
## 6 subscribe 19117 1 twain innocents_abroad
# Count by book & sentiment
books_sent_count <- all_books %>%
# Inner join to nrc lexicon
inner_join(nrc_lex, by = c("term" = "word")) %>%
# Keep only positive or negative
filter(grepl("positive|negative", sentiment)) %>%
# Count by book and by sentiment
count(book, sentiment)
# Review entire object
books_sent_count
## # A tibble: 22 x 3
## book sentiment n
## <chr> <chr> <int>
## 1 bartleby negative 537
## 2 bartleby positive 864
## 3 confidence_man negative 3561
## 4 confidence_man positive 5899
## 5 ct_yankee negative 4048
## 6 ct_yankee positive 6154
## 7 hamlet negative 1677
## 8 hamlet positive 2250
## 9 huck_finn negative 2471
## 10 huck_finn positive 3544
## # ... with 12 more rows
book_pos <- books_sent_count %>%
# Group by book
group_by(book) %>%
# Mutate to add % positive column
mutate(percent_positive = 100 * n / sum(n))
# Plot percent_positive vs. book, filled by sentiment
ggplot(book_pos, aes(percent_positive, book, fill = sentiment)) +
# Add a col layer
geom_col()
oz <- dget("Datasets/oz")
ag <- dget("Datasets/ag")
ag_afinn <- ag %>%
# Inner join to afinn lexicon
inner_join(afinn_lex, by = c("term" = "word"))
oz_afinn <- oz %>%
# Inner join to afinn lexicon
inner_join(afinn_lex, by = c("term" = "word"))
# Combine
all_df <- bind_rows(agamemnon = ag_afinn, oz = oz_afinn, .id = "book")
# Plot value, filled by book
ggplot(all_df, aes(value, fill = book)) +
# Set transparency to 0.3
geom_density(alpha = 0.3) +
theme_gdocs() +
ggtitle("AFINN Score Densities")
Box plots are an easier way to compare multiple distributions (which is hard with kernel density plots).
This exercise introduces tapply(), which allows you to apply functions over a ragged array. You input a vector of values and a vector of factors. For each factor/value combination, the third parameter, a function like sum() or min(), is applied. For example, here is tapply() used on two vectors. The result is an array where Group1 has a value of 2 (1 + 1) and Group2 has a value of 4 (2 + 2).
f1 <- as.factor(c("Group1", "Group2", "Group1", "Group2"))
stat1 <- c(1, 2, 1, 2)
tapply(stat1, f1, sum)
## Group1 Group2
## 2 4
all_book_polarity <- dget("Datasets/all_book_polarity")
# Summary by document
tapply(all_book_polarity$polarity, all_book_polarity$book, summary)
## $huck
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.38695 -0.25820 0.23570 0.04156 0.26726 1.60357
##
## $agamemnon
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4667 -0.3780 -0.3333 -0.1266 0.3333 1.2247
##
## $moby
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.13333 -0.28868 -0.25000 -0.02524 0.28868 1.84752
##
## $oz
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.2728 -0.2774 0.2582 0.0454 0.2887 1.1877
# Box plot
ggplot(all_book_polarity, aes(x = book, y = polarity)) +
geom_boxplot(fill = c("#bada55", "#F00B42", "#F001ED", "#BA6E15"), col = "darkred") +
geom_jitter(position = position_jitter(width = 0.1, height = 0), alpha = 0.02) +
theme_gdocs() +
ggtitle("Book Polarity")
Compare multiple values
moby_huck <- dget("Datasets/moby_huck")
scores <- moby_huck %>%
# Inner join to lexicon
inner_join(nrc_lex, by = c("term" = "word")) %>%
# Drop positive or negative sentiments
filter(!grepl("positive|negative", sentiment)) %>%
# Count by book and sentiment
count(book, sentiment) %>%
# Spread book, using n as values
spread(book, n)
# Review scores
scores
## # A tibble: 8 x 3
## sentiment huck moby
## <chr> <int> <int>
## 1 anger 39 91
## 2 anticipation 74 138
## 3 disgust 26 70
## 4 fear 48 121
## 5 joy 56 109
## 6 sadness 47 112
## 7 surprise 41 67
## 8 trust 63 147
# JavaScript radar chart
chartJSRadar(scores)
treemap(
data_frame,
index = c("group", "individual_document"),
vSize = "doc_length",
vColor = "avg_score",
type = "value",
title = "Sentiment Scores by Doc",
palette = c("red", "white", "green")
)
book_length <- all_books %>%
# Count number of words per book
count(book)
# Examine the results
book_length
## # A tibble: 11 x 2
## book n
## <chr> <int>
## 1 bartleby 8871
## 2 confidence_man 48834
## 3 ct_yankee 58229
## 4 hamlet 18725
## 5 huck_finn 55198
## 6 innocents_abroad 99031
## 7 julius_caesar 13165
## 8 macbeth 12240
## 9 moby_dick 109996
## 10 romeo_juliet 16870
## 11 tom_sawyer 38831
book_tree <- all_books %>%
# Inner join to afinn lexicon
inner_join(afinn_lex, by = c("term" = "word")) %>%
# Group by author, book
group_by(author, book) %>%
# Calculate mean book value
summarize(mean_value = mean(value)) %>%
# Inner join by book
inner_join(book_length, by = "book")
## `summarise()` regrouping output by 'author' (override with `.groups` argument)
# Examine the results
book_tree
## # A tibble: 11 x 4
## # Groups: author [3]
## author book mean_value n
## <chr> <chr> <dbl> <int>
## 1 melville bartleby 0.101 8871
## 2 melville confidence_man 0.506 48834
## 3 melville moby_dick 0.161 109996
## 4 shakespeare hamlet 0.0984 18725
## 5 shakespeare julius_caesar 0.0846 13165
## 6 shakespeare macbeth 0.222 12240
## 7 shakespeare romeo_juliet 0.175 16870
## 8 twain ct_yankee 0.199 58229
## 9 twain huck_finn 0.0763 55198
## 10 twain innocents_abroad 0.405 99031
## 11 twain tom_sawyer -0.0265 38831
treemap(
# Use the book tree
book_tree,
# Index by author and book
index = c("author", "book"),
# Use n as vertex size
vSize = "n",
# Color vertices by mean_value
vColor = "mean_value",
# Draw a value type
type = "value",
title = "Book Sentiment Scores",
palette = c("red", "white", "green")
)
## Warning in if (class(try(col2rgb(palette), silent = TRUE)) == "try-error")
## stop("color palette is not correct"): the condition has length > 1 and only the
## first element will be used
bos_reviews <- readRDS("Datasets/bos_reviews.rds")
str(bos_reviews)
## 'data.frame': 1000 obs. of 2 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ comments: chr "My daughter and I had a wonderful stay with Maura. She kept in close touch with us throughout the day as we wer"| __truncated__ "We stay at Elizabeth's place for 3 nights in October 2014.\nThe apartment is really a great place to stay. \nLo"| __truncated__ "If you're staying in South Boston, this is a terrific place to camp out. The apartment and bedroom are lovely, "| __truncated__ "Derian and Brian were great and prompt with their communications with us. The room was as described; it was a s"| __truncated__ ...
dim(bos_reviews)
## [1] 1000 2
# Practice apply polarity to first 6 reviews (to save CPU)
practice_pol <- polarity(bos_reviews$comments[1:6])
## Warning in polarity(bos_reviews$comments[1:6]):
## Some rows contain double punctuation. Suggested use of `sentSplit` function.
bos_pol <- readRDS("Datasets/bos_pol.rds") # polarity for all 1000 reviews
# Summary for all reviews
summary(bos_pol$all$polarity)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.9712 0.6047 0.8921 0.9022 1.2063 3.7510 1
# Plot Boston polarity all element
ggplot(bos_pol$all, aes(x = polarity, y = ..density..)) +
geom_histogram(binwidth = 0.25, fill = "#bada55", colour = "grey60") +
geom_density(size = 0.75) +
theme_gdocs()
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
pos_terms <- bos_reviews %>%
# Add polarity column
mutate(polarity = bos_pol$all$polarity) %>%
# Filter for positive polarity
filter(polarity > 0) %>%
# Extract comments column
pull(comments) %>%
# Paste and collapse
paste(collapse = " ")
neg_terms <- bos_reviews %>%
mutate(polarity = bos_pol$all$polarity) %>%
filter(polarity < 0) %>%
pull(comments) %>%
paste(collapse = " ")
# Concatenate the terms
all_corpus <- c(pos_terms, neg_terms) %>%
# Source from a vector
VectorSource() %>%
# Create a volatile corpus
VCorpus()
all_tdm <- TermDocumentMatrix(
# Use all_corpus
all_corpus,
control = list(
# Use TFIDF weighting
weighting = weightTfIdf,
# Remove the punctuation
removePunctuation = TRUE,
# Use English stopwords
stopwords = stopwords(kind = "en")
)
)
# Examine the TDM
all_tdm
## <<TermDocumentMatrix (terms: 4965, documents: 2)>>
## Non-/sparse entries: 4348/5582
## Sparsity : 56%
## Maximal term length: 93
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
First you use unnest_tokens() to make the text lowercase and tokenize the reviews into single words.
Calling tidy() on a TermDocumentMatrix object converts the TDM to a tibble (a quick sketch follows this note).
Sometimes it is useful to capture the original word order within each group of a corpus. To do so, use mutate(). In mutate() you will use seq_along() to create a sequence of numbers from 1 to the length of the object. This will capture the word order as it was written.
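As mentioned above, a minimal sketch of tidy() applied to the TF-IDF weighted all_tdm built earlier (here the count column holds the TF-IDF weights rather than raw counts):
# Tidy the TDM into a term/document/count tibble
head(tidy(all_tdm))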
# Vector to tibble
tidy_reviews <- bos_reviews %>%
unnest_tokens(word, comments)
head(tidy_reviews, 20)
## id word
## 1 1 my
## 1.1 1 daughter
## 1.2 1 and
## 1.3 1 i
## 1.4 1 had
## 1.5 1 a
## 1.6 1 wonderful
## 1.7 1 stay
## 1.8 1 with
## 1.9 1 maura
## 1.10 1 she
## 1.11 1 kept
## 1.12 1 in
## 1.13 1 close
## 1.14 1 touch
## 1.15 1 with
## 1.16 1 us
## 1.17 1 throughout
## 1.18 1 the
## 1.19 1 day
# Group by and mutate
tidy_reviews <- tidy_reviews %>%
group_by(id) %>%
mutate(original_word_order = seq_along(word))
head(tidy_reviews, 20)
## # A tibble: 20 x 3
## # Groups: id [1]
## id word original_word_order
## <int> <chr> <int>
## 1 1 my 1
## 2 1 daughter 2
## 3 1 and 3
## 4 1 i 4
## 5 1 had 5
## 6 1 a 6
## 7 1 wonderful 7
## 8 1 stay 8
## 9 1 with 9
## 10 1 maura 10
## 11 1 she 11
## 12 1 kept 12
## 13 1 in 13
## 14 1 close 14
## 15 1 touch 15
## 16 1 with 16
## 17 1 us 17
## 18 1 throughout 18
## 19 1 the 19
## 20 1 day 20
# Load stopwords
data("stop_words")
# Perform anti-join
tidy_reviews_without_stopwords <- tidy_reviews %>%
anti_join(stop_words)
## Joining, by = "word"
# Get the correct lexicon
bing <- get_sentiments("bing")
# Calculate polarity for each review
pos_neg <- tidy_reviews %>%
inner_join(bing) %>%
count(sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
## Joining, by = "word"
# Check outcome
summary(pos_neg)
## id negative positive polarity
## Min. : 1 Min. : 0.0000 Min. : 0.000 Min. :-10.000
## 1st Qu.: 251 1st Qu.: 0.0000 1st Qu.: 4.000 1st Qu.: 3.000
## Median : 499 Median : 0.0000 Median : 6.000 Median : 5.000
## Mean : 500 Mean : 0.6633 Mean : 6.569 Mean : 5.906
## 3rd Qu.: 748 3rd Qu.: 1.0000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :1000 Max. :14.0000 Max. :42.000 Max. : 37.000
This exercise will create a common visual for you to understand term frequency. Specifically, you will review the most frequent terms from among the positive and negative collapsed documents.
# Matrix
all_tdm_m <- as.matrix(all_tdm)
# Column names
colnames(all_tdm_m) <- c("positive", "negative")
# Top pos words
order_by_pos <- order(all_tdm_m[, 1], decreasing = TRUE)
# Review top 10 pos words
all_tdm_m[order_by_pos, ] %>% head(10)
## Docs
## Terms positive negative
## walk 0.004558911 0
## definitely 0.004174067 0
## staying 0.003730018 0
## city 0.003285968 0
## wonderful 0.003108348 0
## restaurants 0.003049142 0
## highly 0.002960332 0
## station 0.002693902 0
## enjoyed 0.002427472 0
## subway 0.002397869 0
# Top neg words
order_by_neg <- order(all_tdm_m[, 2], decreasing = TRUE)
# Review top 10 neg words
all_tdm_m[order_by_neg, ] %>% head(10)
## Docs
## Terms positive negative
## condition 0 0.002159827
## don´t 0 0.002159827
## demand 0 0.001439885
## disappointed 0 0.001439885
## dumpsters 0 0.001439885
## hygiene 0 0.001439885
## inform 0 0.001439885
## it´s 0 0.001439885
## nasty 0 0.001439885
## safety 0 0.001439885
comparison.cloud(
# Use the term-document matrix
all_tdm_m,
# Limit to 20 words
max.words = 20,
colors = c("darkgreen","darkred")
)
Sometimes, another way to uncover an insight is to scale the scores so they are centered at 0, then perform the corpus subset. This means some of the previously positive comments may become part of the negative subsection, or vice versa, since the mean is shifted to 0.
# Review
bos_pol$all[1:6,1:3]
## all wc polarity
## 1 all 77 1.1851900
## 2 all 78 1.2455047
## 3 all 39 0.4803845
## 4 all 101 0.7562283
## 5 all 16 0.2500000
## 6 all 79 0.5625440
# Scale/center & append
bos_reviews$scaled_polarity <- scale(bos_pol$all$polarity)
# Subset positive comments
pos_comments <- subset(bos_reviews$comments, bos_reviews$scaled_polarity > 0)
# Subset negative comments
neg_comments <- subset(bos_reviews$comments, bos_reviews$scaled_polarity < 0)
# Paste and collapse the positive comments
pos_terms <- paste(pos_comments, collapse = " ")
# Paste and collapse the negative comments
neg_terms <- paste(neg_comments, collapse = " ")
# Organize
all_terms<- c(pos_terms, neg_terms)
# VCorpus
all_corpus <- VCorpus(VectorSource(all_terms))
# TDM
all_tdm <- TermDocumentMatrix(
all_corpus,
control = list(
weighting = weightTfIdf,
removePunctuation = TRUE,
stopwords = stopwords(kind = "en")
)
)
# Column names
all_tdm_m <- as.matrix(all_tdm)
colnames(all_tdm_m) <- c("positive", "negative")
# Comparison cloud
comparison.cloud(
all_tdm_m,
max.words = 100,
colors = c("darkgreen", "darkred")
)