library(topicmodels)
library(rvest)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(tidyverse)
library(janitor)
library(maps)
library(ggthemes)
library(tidytext)
library(textdata)
For this section, I'll be using the janeaustenr package and replicating the sentiment analysis example from the textbook:
library(janeaustenr)
library(dplyr)
library(stringr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
head(tidy_books, 5) %>% knitr::kable()
| book | linenumber | chapter | word |
|---|---|---|---|
| Sense & Sensibility | 1 | 0 | sense |
| Sense & Sensibility | 1 | 0 | and |
| Sense & Sensibility | 1 | 0 | sensibility |
| Sense & Sensibility | 3 | 0 | by |
| Sense & Sensibility | 3 | 0 | jane |
Essentially, the text is tokenized here, giving us one row per word along with the book, line number, and chapter it came from. The next step is joining our data with the NRC lexicon to pull out the words associated with joy:
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
Here we have a count of the most common joy words in Emma. From here, we can compute a net sentiment (positive minus negative) for each 80-line section of each book and visualize it:
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
The textbook example also compares the three lexicons (AFINN, Bing, and NRC) on Pride & Prejudice:
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  labs(title = "Sentiment Analysis of Pride and Prejudice")
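One reason the Bing and NRC panels differ in scale is the balance of positive versus negative terms in each lexicon. As a quick check along the lines of the textbook (a sketch, using only `get_sentiments()` as above, with no claim about the exact counts):

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)

get_sentiments("bing") %>%
  count(sentiment)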
Next, we'll visualize the most common words with a word cloud, followed by a comparison cloud that splits words by Bing sentiment:
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 80))

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("green", "purple"),
    max.words = 80
  )
The last step tokenizes the text into sentences and chapters, then finds, for each book, the chapter with the highest ratio of negative words to total words:
p_and_p_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup() %>%
  knitr::kable()
| book | chapter | negativewords | words | ratio |
|---|---|---|---|---|
| Sense & Sensibility | 43 | 161 | 3405 | 0.0472834 |
| Pride & Prejudice | 34 | 111 | 2104 | 0.0527567 |
| Mansfield Park | 46 | 173 | 3685 | 0.0469471 |
| Emma | 15 | 151 | 3340 | 0.0452096 |
| Northanger Abbey | 21 | 149 | 2982 | 0.0499665 |
| Persuasion | 4 | 62 | 1807 | 0.0343110 |
Silge, Julia, and David Robinson. Text Mining with R: A Tidy Approach. O'Reilly Media, 2017. https://www.tidytextmining.com/
For this assignment, I'll be using tweets from the hashtag #WhyIDidntReport, which trended around the confirmation of Justice Kavanaugh to the Supreme Court. I scraped the data and will be performing a sentiment analysis using the NRC lexicon, which provides more detail on the emotions behind the terms used.
webpage <- read_html("tweet.html")
table <- html_table(webpage, fill = TRUE, header = TRUE)
# colnames(table) = table[[1]]
data <- table[[1]]
colnames(data) <- make_clean_names(colnames(data))
head(data, 2) %>% knitr::kable() %>% kableExtra::kable_styling()
| tweet_id | username | tweet | time | tweet_type | retweeted_by | number_of_retweets | hashtags | mentions | name | location | web | bio | number_of_tweets | number_of_followers | number_following | coordinates |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.043256e+18 | MuellerSheWrote | RT @MuellerSheWrote: I was drugged and raped in the Navy. The MP told me if I filed a report I would be charged with adultery because my ra… | 2018-09-21T21:50:09.000Z | Retweet | cliffordbeshers | 63 | " | | Mueller, She Wrote Podcast | San Diego, CA | https://t.co/1JxEsjryyU | Three women unraveling the Trump-Russia investigation. Alt Fed Employee. Vet. PhD. @jordansconfused @tweetjaleesa #resist #mueller #VetsResistSquadron #FBR | 9825 | 19958 | 6171 | +32.71533-117.15726/ |
| 1.043256e+18 | Kendragarden | RT @Kendragarden: Because even typing this tweet decades later is giving me a panic attack. #WhyIDidntReport | 2018-09-21T21:50:09.000Z | Retweet | viking3rdwave | 36 | WhyIDidntReport | | Kendra Alvey | Los Angeles. | https://t.co/I4ZS3Y5mXC | Author of The Haunter (coming 10/2), the nicest Slytherin. https://t.co/hWuNbvfMKj | 6228 | 26274 | 927 | +34.05223-118.24368/ |
There's quite a bit of information here, so we'll restrict the data to original tweets (dropping retweets):
tweets <- subset(data, tweet_type == "Tweet")
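As a quick sanity check (a minimal aside, relying only on the tweet_type column shown above), we can confirm what the filter keeps:

## distribution of tweet types, and the number of original tweets retained
table(data$tweet_type)
nrow(tweets)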
We'll start by building a corpus and previewing a few of the tweets:
tweet_text <- Corpus(VectorSource(tweets$tweet))
inspect(tweet_text[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] @SenDuckworth I did report, called 911 on 2 different occasions in 2 different cities. Police left me with my attacker/stalker both times bc he is a rich white man. 911 dispatcher called me 2 weeks later to check on me, couldn't understand why there was no police report!"\r#WhyIDidntReport
## [2] #WhyIDidntReport\r#WhyIFearMyDaughterWouldntReport = Even our POTUS exploits victims! And other politicians allow it by remaining silent. \t\t\t\t\r#WhoWillProtectHer
## [3] #WhyIDidntReport i thought he was my friend. He blamed it on me, my body language. I told him no, he said my body was telling him something different. I called the police, but they failed to do anything about it #JabariStegall
From here, we'll clean the text by replacing unnecessary characters (slashes, @ signs, and pipes) with spaces:
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
tweet_text2 <- tm_map(tweet_text, toSpace, "/")
tweet_text2 <- tm_map(tweet_text2, toSpace, "@")
tweet_text2 <- tm_map(tweet_text2, toSpace, "\\|")
Next, we'll normalize the corpus: lowercase the text and remove numbers, stopwords, punctuation, and extra whitespace:
## Convert the text to lower case
tweet_text2 <- tm_map(tweet_text2, content_transformer(tolower))
## Remove numbers
tweet_text2 <- tm_map(tweet_text2, removeNumbers)
## Remove common English stopwords
tweet_text2 <- tm_map(tweet_text2, removeWords, stopwords("english"))
## Specify additional stopwords as a character vector
tweet_text2 <- tm_map(tweet_text2, removeWords, c("i", "my", "because"))
## Remove punctuation
tweet_text2 <- tm_map(tweet_text2, removePunctuation)
## Eliminate extra white spaces
tweet_text2 <- tm_map(tweet_text2, stripWhitespace)
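Before moving on, it's worth spot-checking that the cleaning behaved as expected; a minimal sketch re-inspecting a couple of documents:

## preview the cleaned versions of the first two tweets
inspect(tweet_text2[1:2])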
We can also group the tweets into topics using LDA:
dtm <- DocumentTermMatrix(tweet_text2)
dtm <- removeSparseTerms(dtm, 0.99)
ldaOut <- LDA(dtm, k = 4)
topics <- terms(ldaOut, 6)
head(topics, 2) %>% knitr::kable()
| Topic 1 | Topic 2 | Topic 3 | Topic 4 |
|---|---|---|---|
| whyididntreport | whyididntreport | whyididntreport | whyididntreport |
| believe | told | realdonaldtrump | because |
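Beyond the top terms per topic, the topicmodels package can also report the most likely topic for each tweet; a sketch of that lookup (using topics() and posterior(), both from topicmodels):

## most likely topic per tweet (document)
tweet_topics <- topics(ldaOut)
head(tweet_topics)
## per-document topic probabilities, for a softer assignment
topic_probs <- posterior(ldaOut)$topics
head(round(topic_probs, 2))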
The next step builds a term-document matrix and sorts the words by overall frequency:
tweets_matrix <- TermDocumentMatrix(tweet_text2)
m <- as.matrix(tweets_matrix)
v <- sort(rowSums(m), decreasing = TRUE)
doc_matrix <- data.frame(word = names(v), freq = v)
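A quick peek at the top of the frequency table confirms the sort (a minimal check of the object just built):

## the ten most frequent terms across all tweets
head(doc_matrix, 10)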
Here we'll plot the 8th through 30th most frequent words (skipping the very top terms, which include the hashtag itself):
graph <- ggplot(data = doc_matrix[8:30, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "black") +
  labs(title = "Most frequent words", x = "Word", y = "Word frequencies") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))
plot(graph)
We can also preview term frequencies using a word cloud (excluding the hashtag itself and "because"):
cloud_words <- subset(doc_matrix, !word %in% c("whyididntreport", "because"))
wordcloud(words = cloud_words$word, freq = cloud_words$freq, min.freq = 1,
          max.words = 50, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Here we'll join the word frequencies with the NRC lexicon and visualize the emotion categories:
nrc_sent <- get_sentiments("nrc")
# bing_sent <- get_sentiments("bing")
# afinn_sent <- get_sentiments("afinn")

tweet_nrc_sent <- doc_matrix %>%
  inner_join(nrc_sent, by = c("word"))
# tweet_bing_sent <- doc_matrix %>% inner_join(bing_sent, by = c("word"))
# tweet_afinn_sent <- doc_matrix %>% inner_join(afinn_sent, by = c("word"))

tweet_nrc_sum <- tweet_nrc_sent %>%
  select(sentiment, freq) %>%
  group_by(sentiment) %>%
  rename(count = freq) %>%
  summarise(n = sum(count)) %>%
  arrange(n) %>%
  filter(sentiment != "negative" & sentiment != "positive")
tweet_nrc_sum %>%
  ggplot(aes(reorder(sentiment, -n), n, fill = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  theme_clean()
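To see which terms drive each emotion, we can pull the top contributing words per NRC category; a sketch using the tweet_nrc_sent object built above:

## top five words contributing to each NRC emotion (binary categories excluded)
tweet_nrc_sent %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  group_by(sentiment) %>%
  slice_max(freq, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(sentiment, desc(freq))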
NRC was probably the best lexicon for this data. Unlike Bing, it doesn't reduce sentiment to a positive/negative binary; its emotion categories (fear, sadness, anger, trust, and so on) add much-needed nuance, especially given the sensitivity of the subject.