library(topicmodels)
library(rvest)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(tidyverse)
library(janitor)
library(maps)
library(ggthemes)
library(tidytext)
library(textdata)

Part I: Textbook Assignment

  • Re-create and analyze primary code from the textbook.
  • Provide a citation to the textbook, using a standard citation format such as APA or MLA.

For this section, I'll be using the janeaustenr package and replicating the sentiment analysis example from the textbook:

library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

head(tidy_books, 5) %>% knitr::kable()
book                  linenumber   chapter   word
Sense & Sensibility            1         0   sense
Sense & Sensibility            1         0   and
Sense & Sensibility            1         0   sensibility
Sense & Sensibility            3         0   by
Sense & Sensibility            3         0   jane

Essentially, the books are tokenized here, giving us one row per word. The next step is joining our data with the NRC lexicon:

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 x 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows

Here we have a count of the "joy" words in Emma. From here, we can tabulate net sentiment across sections of each book and visualize it:

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

The textbook code also compares the three sentiment lexicons on Pride & Prejudice:

pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
      filter(sentiment %in% c(
        "positive",
        "negative"
      ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  labs(title = "Sentiment Analysis of Pride and Prejudice")

Next, we'll visualize using a wordcloud:

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 80))

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("green", "purple"),
    max.words = 80
  )

The last step tokenizes the text by sentence and by chapter, and then finds, for each book, the chapter with the highest ratio of negative words to total words:

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup() %>% 
  knitr::kable()
book                  chapter   negativewords   words       ratio
Sense & Sensibility        43             161    3405   0.0472834
Pride & Prejudice          34             111    2104   0.0527567
Mansfield Park             46             173    3685   0.0469471
Emma                       15             151    3340   0.0452096
Northanger Abbey           21             149    2982   0.0499665
Persuasion                  4              62    1807   0.0343110

References

Silge, Julia, and David Robinson. Text Mining with R: A Tidy Approach. O'Reilly Media, 2017. https://www.tidytextmining.com/

Part II: Original Example

For this assignment, I'll be using tweets from the hashtag #WhyIDidntReport, which trended around the confirmation of Justice Kavanaugh to the Supreme Court. I scraped the data and will perform a sentiment analysis using the NRC lexicon, which provides more detail on the emotions behind the terms used.

Importing Tweets into a DataFrame

webpage <- read_html("tweet.html")
table <- html_table(webpage, fill = TRUE, header=TRUE)
# colnames(table) =  table[[1]]

data <- table[[1]]

colnames(data) <- make_clean_names(colnames(data))
head(data, 2) %>% knitr::kable() %>% kableExtra::kable_styling()
tweet_id username tweet time tweet_type retweeted_by number_of_retweets hashtags mentions name location web bio number_of_tweets number_of_followers number_following coordinates
1.043256e+18 MuellerSheWrote RT @MuellerSheWrote: I was drugged and raped in the Navy. The MP told me if I filed a report I would be charged with adultery because my ra& 2018-09-21T21:50:09.000Z Retweet cliffordbeshers 63 " Mueller, She Wrote Podcast San Diego, CA https://t.co/1JxEsjryyU Three women unraveling the Trump-Russia investigation. Alt Fed Employee. Vet. PhD. @jordansconfused @tweetjaleesa #resist #mueller #VetsResistSquadron #FBR 9825 19958 6171 +32.71533-117.15726/
1.043256e+18 Kendragarden RT @Kendragarden: Because even typing this tweet decades later is giving me a panic attack. #WhyIDidntReport 2018-09-21T21:50:09.000Z Retweet viking3rdwave 36 WhyIDidntReport Kendra Alvey Los Angeles. https://t.co/I4ZS3Y5mXC Author of The Haunter (coming 10/2), the nicest Slytherin. https://t.co/hWuNbvfMKj 6228 26274 927 +34.05223-118.24368/

Subset to Original Tweets

There's quite a bit of information here, so we'll restrict the data to original tweets (excluding retweets).

tweets <- subset(data, tweet_type == "Tweet")

Text mining of tweets

We'll start by previewing some of the tweets:

tweet_text <- Corpus(VectorSource(tweets$tweet))
inspect(tweet_text[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] @SenDuckworth I did report, called 911 on 2 different occasions in 2 different cities. Police left me with my attacker/stalker both times bc he is a rich white man.  911 dispatcher called me 2 weeks later to check on me, couldn't understand why there was no police report!"\r#WhyIDidntReport
## [2] #WhyIDidntReport\r#WhyIFearMyDaughterWouldntReport = Even our POTUS  exploits victims! And other politicians allow it by remaining silent. \t\t\t\t\r#WhoWillProtectHer                                                                                                                            
## [3] #WhyIDidntReport i thought he was my friend. He blamed it on me, my body language. I told him no, he said my body was telling him something different. I called the police, but they failed to do anything about it #JabariStegall

Removing special characters

From here, we'll replace a few unnecessary characters with spaces:

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
## Chain each transformation on the previous result so all of them persist
tweet_text2 <- tm_map(tweet_text, toSpace, "/")
tweet_text2 <- tm_map(tweet_text2, toSpace, "@")
tweet_text2 <- tm_map(tweet_text2, toSpace, "\\|")

Text Cleaning

Here, we'll clean the corpus: lowercasing the text and removing numbers, stopwords, punctuation, and extra whitespace.

## Convert the text to lower case
tweet_text2 <- tm_map(tweet_text2, content_transformer(tolower))
## Remove numbers
tweet_text2 <- tm_map(tweet_text2, removeNumbers)
## Remove common English stopwords
tweet_text2 <- tm_map(tweet_text2, removeWords, stopwords("english"))
## Specify additional stopwords as a character vector (lowercase, since the text is already lowercased)
tweet_text2 <- tm_map(tweet_text2, removeWords, c("i", "my", "because"))
## Remove punctuation
tweet_text2 <- tm_map(tweet_text2, removePunctuation)
## Eliminate extra white space
tweet_text2 <- tm_map(tweet_text2, stripWhitespace)
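
As a quick sanity check (an extra step beyond the original workflow), we can re-inspect a couple of the cleaned tweets to confirm the transformations took effect:

## Preview two cleaned documents (assumes tweet_text2 from the cleaning steps above)
inspect(tweet_text2[1:2])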

Topic Analysis

We can also use LDA to group the tweets by topic:

dtm <- DocumentTermMatrix(tweet_text2)
dtm <- removeSparseTerms(dtm, 0.99)
## Fit a 4-topic LDA model and pull the top 6 terms per topic
ldaOut <- LDA(dtm, k = 4)
topics <- terms(ldaOut, 6)
head(topics, 2) %>% knitr::kable()
Topic 1           Topic 2           Topic 3           Topic 4
whyididntreport   whyididntreport   whyididntreport   whyididntreport
believe           told              realdonaldtrump   because
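
The table above only shows the top terms per topic. To see which topic each tweet was assigned to, the topicmodels package also provides topics() and posterior(); here is a minimal sketch, assuming the ldaOut model fit above:

## Most likely topic for each tweet, and how tweets are distributed across the 4 topics
tweet_topics <- topics(ldaOut)
table(tweet_topics)

## Per-document topic probabilities (one row per tweet, one column per topic)
topic_probs <- posterior(ldaOut)$topics
head(round(topic_probs, 3))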

Document Matrix

The next step is building a term-document matrix and sorting the terms by overall frequency:

tweets_matrix <- TermDocumentMatrix(tweet_text2)
m <- as.matrix(tweets_matrix)
v <- sort(rowSums(m), decreasing = TRUE)
doc_matrix <- data.frame(word = names(v), freq = v)
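
As a quick preview (not shown in the original output), the head of doc_matrix confirms it has one row per term with its corpus-wide frequency:

head(doc_matrix, 5) %>% knitr::kable()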

Building the Word Chart

Here we'll plot the word frequencies:

## Skip the most frequent terms (dominated by the hashtag itself) and plot the next set of words
graph <- ggplot(data = doc_matrix[8:30, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "black") +
  labs(title = "Most frequent words", x = "Word", y = "Word frequencies") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))
plot(graph)

Building the Word Cloud

We can preview term frequency using a wordcloud:

## Drop the hashtag itself and "because", keeping the word and freq columns aligned
plot_words <- subset(doc_matrix, !word %in% c("whyididntreport", "because"))
wordcloud(words = plot_words$word, freq = plot_words$freq, min.freq = 1,
          max.words = 50, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

Sentiment Analysis

Here we'll use the NRC lexicon and visualize the most frequent emotion categories:

nrc_sent <- get_sentiments("nrc")
# bing_sent <-get_sentiments("bing")
#affin_sent <-get_sentiments("afinn")

tweet_nrc_sent <- doc_matrix %>%
  inner_join(nrc_sent, by=c("word")) 

#tweet_bing_sent <- doc_matrix %>% inner_join(bing_sent, by=c("word"))

# tweet_affin_sent <- doc_matrix %>% inner_join(affin_sent, by=c("word"))

tweet_nrc_sum <- tweet_nrc_sent %>%
  select(sentiment, freq) %>%
  group_by(sentiment) %>%
  summarise(n = sum(freq)) %>%
  arrange(n) %>%
  filter(!sentiment %in% c("negative", "positive"))

tweet_nrc_sum %>%
  ggplot(aes(reorder(sentiment, -n), n, fill = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  theme_clean()
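
For comparison, here is a minimal sketch of what the binary bing lexicon gives on the same data, building on the commented-out join above; it collapses the word frequencies into a single positive/negative split:

## Binary sentiment totals using the bing lexicon (sketch; assumes doc_matrix from above)
bing_sent <- get_sentiments("bing")

doc_matrix %>%
  inner_join(bing_sent, by = "word") %>%
  group_by(sentiment) %>%
  summarise(n = sum(freq))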

Conclusion

Of the lexicons considered, NRC was the best fit for this data. Rather than reducing sentiment to a positive/negative binary, it categorizes words into emotions such as fear, sadness, and trust, which adds much-needed nuance given the sensitivity of the subject.