Major 4

Load libraries

library(tmap)
library(RedditExtractoR)
library(anytime)
library(magrittr)
library(ggplot2)
library(dplyr)
library(tidytext)
library(tidyverse)
library(igraph)
library(wordcloud2)
library(textdata)
library(sf)
library(stringi)
library(tidytext)
library(textdata)
library(ggdark)
library(gofastr)
library(syuzhet)
library(sentimentr)
library(lubridate)
library(here)
library(ggraph)

This analysis is looking at the mentions of ‘rats’ in the Chicago subreddit.

Get reddit threads using keywords

# Look up r/chicago threads matching the keyword 'rats', requesting the
# 'top' sort order over the 'all' period.
rats_chi <- find_thread_urls(
  keywords = 'rats',
  subreddit = 'chicago',
  sort_by = 'top',
  period = 'all'
)

## parsing URLs on page 1...
## parsing URLs on page 2...

# List the columns returned for each thread.
names(rats_chi)

## [1] "date_utc"  "timestamp" "title"     "text"      "subreddit" "comments" 
## [7] "url"

# Preview the first few threads.
rats_chi %>%
  head()

Tokenize

# Split the body text of every thread into single-word tokens, one per row.
words <- rats_chi %>%
  unnest_tokens(output = word, input = text, token = "words")

# Bar chart of the 20 most frequent raw tokens (stop words still included).
words %>%
  count(word, sort = TRUE) %>%
  # Rank explicitly on the count column so the selection does not rely on an
  # implicitly guessed weight variable (ties at the cut-off are kept).
  slice_max(n, n = 20) %>%
  # Order the bars by frequency rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  # Horizontal bars keep the word labels readable.
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

We will remove the stop words using a common stop-word data set provided by the tidytext package.

# Bring the stop-word table that ships with tidytext into the session.
data("stop_words")
# Print 50 stop words picked at random.
print(sample(stop_words$word, size = 50))

##  [1] "number"     "inasmuch"   "but"        "near"       "against"   
##  [6] "seconds"    "my"         "no"         "their"      "you're"    
## [11] "together"   "and"        "seen"       "must"       "concerning"
## [16] "me"         "regardless" "really"     "ex"         "becomes"   
## [21] "re"         "ends"       "she's"      "could"      "outside"   
## [26] "sure"       "alone"      "noone"      "tries"      "or"        
## [31] "anywhere"   "done"       "mrs"        "until"      "longest"   
## [36] "we"         "sup"        "others"     "t's"        "would"     
## [41] "is"         "provides"   "novel"      "high"       "they're"   
## [46] "his"        "case"       "associated" "into"       "has"

Remove stop words and compare before and after

# Pattern for text to strip before tokenizing: http(s) URLs plus the HTML
# entities &amp; / &lt; / &gt;.
# NOTE(review): the URL character class only covers letters, digits, "/" and
# ".", so a URL is cut at the first "_", "-", "?" or "=" and the remainder
# stays in the text -- confirm whether that is intended.
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- rats_chi %>%
  # strip URLs and HTML entities from the post body
  mutate(
    text = str_replace_all(string = text, pattern = replace_reg, replacement = "")
  ) %>%
  # one word token per row
  unnest_tokens(output = word, input = text, token = "words") %>%
  # keep only tokens that contain at least one lower-case letter
  filter(str_detect(word, "[a-z]")) %>%
  # remove stop words
  anti_join(stop_words, by = "word")

# Compare row counts before and after cleaning; the cleaned table should be
# the smaller of the two.
glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}") %>%
  print()

## Before: 16864, After: 6032

Removed rat, rats, and Chicago from word plot because that was the keyword and subreddit filter

# Drop the search keyword (both forms) and the subreddit name from the tokens.
words_clean %<>%
  filter(!(word %in% c("rat", "rats", "chicago")))

Create a plot and word cloud based on wordcounts

# Horizontal bar chart of the 20 most frequent cleaned words
# (ties at the cut-off are kept).
words_clean %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  # order the bars by frequency
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Unique wordcounts",
    x = "words",
    y = "counts"
  )

# Word cloud built from the cleaned word frequencies.
wordcloud2(data = count(words_clean, word, sort = TRUE))

Get tri-grams

# Tokenize the URL-stripped post bodies into three-word n-grams, keeping only
# the text column as input.
words_ngram <- rats_chi %>%
  transmute(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(paired_words, text, token = "ngrams", n = 3)

Tri-gram table

# Table of the 20 most frequent tri-grams.
knitr::kable(
  words_ngram %>%
    count(paired_words, sort = TRUE) %>%
    head(n = 20)
)

paired_words	n
NA	91
i don t	8
i live in	7
does anyone know	6
i have a	6
in my apartment	6
in the alley	6
a lot of	5
be able to	5
don’t want to	5
i have been	5
if you are	5
rats in the	5
the property manager	5
all of the	4
all over the	4
and i will	4
anyone know of	4
chicago and il	4
i want to	4

Remove stop words from trigram

# Split every tri-gram into its three component words, one column each.
words_ngram_pair <- separate(
  words_ngram,
  col = paired_words,
  into = c("word1", "word2", "word3"),
  sep = " "
)

# Keep a tri-gram only when none of its three words is a stop word and every
# word contains at least one lower-case letter.
words_ngram_pair_filtered <- words_ngram_pair %>%
  filter(
    !(word1 %in% stop_words$word |
        word2 %in% stop_words$word |
        word3 %in% stop_words$word),
    str_detect(word1, "[a-z]"),
    str_detect(word2, "[a-z]"),
    str_detect(word3, "[a-z]")
  )

# Further restrict to tri-grams whose words are all pure ASCII
# (see any ASCII table for the character set).
words_ngram_pair_filtered <- words_ngram_pair_filtered %>%
  filter(
    stri_enc_isascii(word1),
    stri_enc_isascii(word2),
    stri_enc_isascii(word3)
  )

# Tally identical tri-grams, most frequent first.
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3, sort = TRUE)

# Show the 20 most frequent tri-grams as a table.
knitr::kable(head(words_counts, 20))

word1	word2	word3	n
archive	january	_tribune_rewind_january_1857	2
chicago	tribune	rewind	2
dead	animal	smell	2
ecotech	pest	control	2
lying	rat	management	2
posting	monthly	highlights	2
unusually	warm	winter	2
1080format	pjpgauto	webps	1
24th	maman	savant	1
3d	tv	xvt3d554sv	1
3x	setting	bait	1
4th	pest	control	1
90s	bulls	team	1
_king	jk	jk	1
_residents_what_can_the_city_do_to_make	utm_source	shareutm_medium	1
_signs_have_popped_up_on_north_side_have_you	wgn	story	1
_tribune_rewind_january_1857	doctors	recommend	1
_tribune_rewind_january_1857	february	_tribune_rewind_february_1857	1
abraxas	anniversaria	owen	1
accessibility	safety	features	1

The top tri-grams that illustrate how Chicago redditors feel about rats are “dead-animal-smell”, “lying-rat-management”, and “unusually-warm-winter”. These particular tri-grams tell a story about the discovery of rats, blaming the source of the problem, and claiming a cause for more rat sightings. “Chicago-tribune-rewind” is probably a notable news article that was popular among Chicagoans facing a rat problem.

Visualize word networks

# Plot a word network for the tri-grams that occur at least twice.
#
# graph_from_data_frame() reads only the first two columns as edge endpoints
# and treats every further column as an edge attribute, so passing the
# three-word table directly would leave word3 out of the graph. Each tri-gram
# is therefore expanded into its two consecutive links
# (word1 -> word2 and word2 -> word3) before the graph is built.
trigram_frequent <- words_counts %>%
  filter(n >= 2)

bind_rows(
  trigram_frequent %>% transmute(from = word1, to = word2, n = n),
  trigram_frequent %>% transmute(from = word2, to = word3, n = n)
) %>%
  # Merge identical links contributed by different tri-grams, adding counts.
  count(from, to, wt = n, name = "n") %>%
  graph_from_data_frame() %>% # convert to graph
  ggraph(layout = "fr") +
  # Edge width encodes the count. The transparency is a fixed setting, so it
  # is passed outside aes() instead of being mapped like a data variable.
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "gold", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Save .csv for Bert analysis in google COLAB

# Export the raw threads so they can be scored with BERT outside of R.
write_csv(x = rats_chi, file = "rats_chi.csv")

Import the data processed from the Colab to R.

# Read the BERT-scored threads back in and discard rows without a BERT label.
reddit_sentiment <- drop_na(read_csv('rats_chi_bert.csv'), 'bert_label')

## New names:
## Rows: 194 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): title, text, subreddit, url, bert_label dbl (4): ...1, timestamp,
## comments, bert_score date (1): date_utc
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Get sentiment scores using the dictionary method for comparison.

# Replace missing titles and bodies with empty strings, then join the two
# into a single "title. text" string per thread.
reddit_sentiment <- reddit_sentiment %>%
  mutate(across(c(title, text), ~ replace_na(.x, ""))) %>%
  mutate(title_text = str_c(title, text, sep = ". "))

# Score the combined title + text of each thread with sentimentr.
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)

# Copy the average sentiment and the word count back onto the thread table.
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary$ave_sentiment
reddit_sentiment$word_count <- reddit_sentiment_dictionary$word_count

Check the correlation between the sentiment values from two different methods.

# Turn the first character of the BERT label into a number
# (presumably the star rating, e.g. "1 star" -> 1; verify against the CSV).
reddit_sentiment <- reddit_sentiment %>%
  mutate(bert_label_numeric = as.numeric(str_sub(bert_label, 1, 1)))

# Correlation between the numeric BERT label and the dictionary sentiment.
with(reddit_sentiment, cor(bert_label_numeric, sentiment_dict))

## [1] 0.3408039

# Dictionary sentiment against the numeric BERT label, jittered horizontally
# so that threads sharing a label do not overplot.
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
  geom_jitter(width = 0.1, height = 0) +
  # Neutral-sentiment reference line. geom_hline() draws one line across the
  # whole panel, whereas a geom_line() with a constant y builds the line from
  # one point per data row and stops at the range of the data.
  geom_hline(yintercept = 0, color = '#FFD700', linewidth = 1, linetype = 'dashed') +
  dark_theme_grey()

## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

Let’s look at some example threads and the predicted sentiment, and see which method makes more sense.

BERT: 1 star (negative) vs. 5 stars (positive)

# For each BERT label, keep the two threads with the highest BERT score.
bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% 1:5) %>%
  # sort the whole table first; the per-label slice below then takes the top rows
  arrange(desc(bert_score)) %>%
  group_by(bert_label) %>%
  slice_head(n = 2) %>%
  ungroup()

# Example threads labelled 1 star
print(pull(filter(bert_example, bert_label_numeric == 1), title_text))

## [1] "Mice in apartment. We've had an ongoing issue with mice/rats in our apartment since we've moved in. We've submitted maintenance requests, been in contact with the landlord, contacted the exterminator through them, and submitted a Chi311 ticket. It seems nearly impossible to get rid of. Our apartment, especially kitchen, is virtually spotless but nothing is working. They even have those bait traps set up in our unit. The mice seem like they are coming in and doing their thing. We know they eat the bait but we are still having the problem.\n\nI read somewhere there are tenant laws about this...where you have to draft a formal letter to the landlord and they need to fix the issue in 14 days, otherwise you can break the lease and withhold rent. Does anyone know where we actually need to send the formal letter to? Is it an in person thing, or is it an email? If anyone has any information I would appreciate it greatly!! This is the most disgusting thing I have ever dealt with."
## [2] "There are too many pigeons at my train stop. This is getting ridiculous. I live by the Pulaski Orange line and theres maybe over 100 or 200 pigeons living in the underpass. They poop so much!!! Theres literally piles of it from where they sleep. It's starting to smell so bad. \n\nI've been shit on by those damn pigeons like three times already, even when I'm cautious and make sure to look where theyre stationed.\n\nAnyway, I just wanted to rant. I hate pigeons. Stupid flying rats."

# Example threads labelled 2 stars
print(pull(filter(bert_example, bert_label_numeric == 2), title_text))

## [1] "Logan Square, or Surrounding Rat Owners?. Hi, Logan Square and surrounding area peeps... I have had a really rough couple of days: one of two of my pet rats is dying. One day he was totally fine, the next he was not and now, he's in pretty bad shape. \n\nI am posting in the hopes of meeting up with another couple of rat owners for ratty play dates.  You see, rats are very social creatures &amp; my surviving rat would probably enjoy making some new friends as I will not be purchasing a new rat for him.\n\nIn October they would have both been three years old. I feel like that is too old to introduce a new cage mate and after the second rat passes I will be taking a hiatus from rat ownership to try to get pregnant. So the old trick of buying two new rats to join the elderly rat wouldn't really work for us.\n\nSend me a PM if you're interested in having a play date for our furry buddies."
## [2] "I wonder if this would work on the rats that I keep seeing on top/around the garbage bins in the alley? The city is supposedly baiting the alleys but so far it does not seem to make any difference.. "

# Example threads labelled 3 stars
print(pull(filter(bert_example, bert_label_numeric == 3), title_text))

## [1] "Do those black boxes actually work to trap the rats? Most rats I've seen are too big to even fit through the holes.. I've seen some thicc bois around the alleys and I can't imagine them even fitting into those boxes."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [2] "Have you noticed your neighborhood's rat population growing?. Since, I want to say November, my neighbors and I have noticed way more rats around the neighborhood. Even through the winter, there's just been a very large influx of rats all over the place. We're talking in the alley, on the sidewalk, in the streets, probably even in all of our buildings. It wasn't like this in earlier years. Of course you'd see a couple every week or so, but now it's going to the point that I see several every hour I'm outside.\n\nThis trend has me wondering what's going on in other parts of the city. Is it the same out there, are yall seeing more/less? We have seen a few more cats around, which I personally like, but I'm a little worried the increase will bring more snakes and coyotes to the neighborhood. About a month ago my friends and I were hanging at our bonfire spot by the river, and actually saw a group of 3 coyotes run up from the side of the river, up into our hangout area, and then out into the neighborhood. Think they might be coming to the neighborhood to hunt cats &amp; rats."

# Example threads labelled 4 stars
print(pull(filter(bert_example, bert_label_numeric == 4), title_text))

## [1] "There aren't many rats in my neighborhood, but there's definitely one less tonight. Good work kitty cat. https://m.imgur.com/gallery/Ec6fdhs\n\nBtw, he ate the entire thing, minus the tail."
## [2] "Rat Patrol [From 1998 but still a cool article]. "

# Example threads labelled 5 stars
print(pull(filter(bert_example, bert_label_numeric == 5), title_text))

## [1] "We made the list for 10 best cities in the US....For Rats. "                        
## [2] "Awesome piece in Vice on Chicago native and organized labor symbol Scabby The Rat. "

The above sentiment analysis seems to correctly identify the negative threads and the neutral threads, but based on this sample, it is not able to pick up the sarcasm. The first example for the 5 star sentiment “We made the list for 10 best cities in the US….For Rats.” is sarcastic, but the second example, interestingly, is a positive post about a negative topic, “Awesome piece in Vice on Chicago native and organized labor symbol Scabby The Rat.” Scabby the Rat is a common sight in Chicago, put up at sites where there is attempted union busting. Also, the first example of the 2 star sentiment is sad and sentimental, which a more nuanced NLP model might categorize separately from the deep anger of the 1 star posts, but here it has a ‘negative’ sentiment.

It is interesting to read what is considered a 4 star or a 2 star, and they do express positive and negative sentiment, but they are clearly less extreme than the 1 and 5 stars. Also, based on this sample, it appears that longer posts are more negative… This will be examined further down.

Dictionary method: negative vs. positive

# Label a thread 'positive' when its dictionary score is above zero and
# 'negative' otherwise (a score of exactly 0, or a missing score, therefore
# counts as 'negative'). Then keep the three threads with the largest absolute
# score per label and list them from most negative to most positive.
sentimentr_example <- reddit_sentiment %>%
  mutate(
    sentimentr_abs = abs(sentiment_dict),
    sentimentr_binary = if_else(
      sentiment_dict > 0,
      'positive',
      'negative',
      missing = 'negative'
    )
  ) %>%
  arrange(desc(sentimentr_abs)) %>%
  group_by(sentimentr_binary) %>%
  slice_head(n = 3) %>%
  ungroup() %>%
  arrange(sentiment_dict)

# Strongest negative examples from the dictionary method
print(pull(filter(sentimentr_example, sentimentr_binary == 'negative'), title_text))

## [1] "Rat poison likely to blame for deadly side effects of synthetic pot in Illinois. "
## [2] "Meet the cats fighting Chicago\031s rat problem. "                                
## [3] "CPS inspections \030blitz\031 finds rat droppings, bugs, filth in schools. "

# Strongest positive examples from the dictionary method
print(pull(filter(sentimentr_example, sentimentr_binary == 'positive'), title_text))

## [1] "Rats Don't Deserve Death By Dry Ice, PETA Says; They Warrant Our Protection - Downtown. "
## [2] "Rat Patrol [From 1998 but still a cool article]. "                                       
## [3] "This demonic rat never fails to terrorize me. "

This sentiment model seems to be accurate except for the 3rd example of positive sentiment. I’m not sure how “This demonic rat never fails to terrorize me.” ends up positive, but I’m not sure I would categorize it as negative either. This post is an example of sentiment not being the best representation of the content of a particular text.

Number of threads by sentiment category.

# Number of threads per BERT label.
ggplot(data = reddit_sentiment, mapping = aes(x = bert_label)) +
  geom_bar(fill = "white") +
  dark_theme_gray()

It is clear that the negative threads dominate the discourse.

Word counts by sentiment category.

# Word count of every thread by BERT label; the red crossbar marks the mean
# word count of each label.
ggplot(data = reddit_sentiment, mapping = aes(x = bert_label, y = word_count)) +
  geom_jitter(width = 0.05, height = 0) +
  stat_summary(geom = "crossbar", fun = mean, width = 0.4, color = "red") +
  dark_theme_gray()

This shows that 1 star threads do not on average have the most words, but the posts with the most words are 1 stars. The 2 posts with the highest word count are 1 and 5 stars, indicating that those that feel the most extreme have the most to say.

Association between a thread’s sentiment and the number of comments on the thread.

# Remove threads whose comment count is an outlier within its own BERT label:
# a thread is kept only if its count lies inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] computed per label.
reddit_sentiment_rm_outlier <- reddit_sentiment %>%
  group_by(bert_label) %>%
  filter(
    between(
      comments,
      # na.rm = TRUE keeps a missing comment count from aborting the quantile
      # computation; such rows are dropped by filter() regardless.
      quantile(comments, 0.25, na.rm = TRUE) - 1.5 * IQR(comments, na.rm = TRUE),
      quantile(comments, 0.75, na.rm = TRUE) + 1.5 * IQR(comments, na.rm = TRUE))) %>%
  # Release the grouping so later steps work on a plain, ungrouped table.
  ungroup()

# Test the correlation between comment count and numeric BERT label on the
# outlier-free data.
cor.test(
  x = reddit_sentiment_rm_outlier$comments,
  y = reddit_sentiment_rm_outlier$bert_label_numeric
)

## 
##  Pearson's product-moment correlation
## 
## data:  reddit_sentiment_rm_outlier$comments and reddit_sentiment_rm_outlier$bert_label_numeric
## t = 0.027959, df = 166, p-value = 0.9777
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1492888  0.1535294
## sample estimates:
##        cor 
## 0.00217007

# Comment count against numeric BERT label with a loess trend line.
ggplot(
  data = reddit_sentiment_rm_outlier,
  mapping = aes(x = bert_label_numeric, y = comments)
) +
  geom_jitter(width = 0.05, height = 0) +
  geom_smooth(method = 'loess', span = 0.75) +
  dark_theme_gray()

## `geom_smooth()` using formula = 'y ~ x'

# Reload the stop-word table and redefine the URL / HTML-entity pattern.
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

# Tokenize the combined title + text into cleaned words, one row per word.
# NOTE(review): 'flu', 'shot' and 'shots' are excluded here although the
# threads were searched with the keyword 'rats' -- this looks carried over
# from a different analysis; confirm it is wanted.
reddit_sentiment_clean <- reddit_sentiment %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  unnest_tokens(output = word, input = title_text, token = "words") %>%
  filter(
    # keep tokens containing at least one lower-case letter
    str_detect(word, "[a-z]"),
    !(word %in% c('flu','shot','shots'))
  ) %>%
  # remove stop words
  anti_join(stop_words, by = "word")

We are not interested in words that are commonly seen in both positive and negative threads. We can identify words that are uniquely seen in either positive or negative threads using anti_join.

# Split the cleaned tokens by thread label: 1-2 (negative) vs. 4-5 (positive).
reddit_sentiment_clean_negative <- filter(
  reddit_sentiment_clean,
  bert_label_numeric %in% 1:2
)
reddit_sentiment_clean_positive <- filter(
  reddit_sentiment_clean,
  bert_label_numeric %in% 4:5
)

# Keep only the words that occur on one side and never on the other.
reddit_sentiment_clean_negative_unique <- anti_join(
  reddit_sentiment_clean_negative,
  reddit_sentiment_clean_positive,
  by = 'word'
)
reddit_sentiment_clean_positive_unique <- anti_join(
  reddit_sentiment_clean_positive,
  reddit_sentiment_clean_negative,
  by = 'word'
)

Words appearing in negative threads

# Colour palette for the word clouds: 20 randomly drawn colours followed by
# "grey" repeated 10000 times for all remaining entries.
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright

df_hsv <- data.frame(h = h, s = s, v = v)
# hsv() is vectorised over hue, saturation and value, so the three columns are
# converted in one call instead of looping over data-frame rows with apply().
pal <- hsv(df_hsv$h, df_hsv$s, df_hsv$v)
pal <- c(pal, rep("grey", 10000))

# Word cloud of the words that only occur in negative threads; words are
# drawn unrotated and coloured from the palette.
wordcloud2(
  data = count(reddit_sentiment_clean_negative_unique, word, sort = TRUE),
  color = pal,
  minRotation = 0,
  maxRotation = 0,
  ellipticity = 0.8
)

This negative word cloud really starts to get at the issues that people have with rats and why there are so many negative posts. Droppings and bad landlords seem to be a significant issue for Chicagoans with rat problems. The ‘laws’ and ‘pay’ probably speak to Scabby the Rat, while most of the other words relate to the rodent problem.

Words appearing in positive threads

# Word cloud of the words that only occur in positive threads; words are
# drawn unrotated and coloured from the palette.
wordcloud2(
  data = count(reddit_sentiment_clean_positive_unique, word, sort = TRUE),
  color = pal,
  minRotation = 0,
  maxRotation = 0,
  ellipticity = 0.8
)

The positive word cloud seems to have less of a clear message and probably includes broader rat topics beyond pest issues and Scabby the union busting rat.

3-3: Temporal analysis

# Add the columns used by the temporal analysis:
#   date        - date_utc converted to POSIXct (rows without a date are dropped)
#   year        - calendar year of the post
#   day_of_week - labelled weekday of the post
#   is_weekend  - "Weekend" for Saturday/Sunday, otherwise "Weekday"
#   time        - hour of day of the post, taken from the parsed timestamp
reddit_sentiment %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>% 
  mutate(year = year(date),
         day_of_week = wday(date, label = TRUE),
         # Compare weekday numbers (week_start = 1: Monday = 1 ... Sunday = 7)
         # rather than the printed day labels, which follow the session locale.
         is_weekend = ifelse(wday(date, week_start = 1) >= 6, "Weekend", "Weekday"),
         # Read the hour directly from the parsed date-time (anytime() uses its
         # default time zone). Splitting the printed date-time on "-", " " and
         # ":" depends on how the value is formatted as text and gives NA
         # whenever the time part is not printed.
         time = as.numeric(hour(anytime(timestamp))))

Stacked bar plot of sentiment by year.

# Thread counts per year, stacked by BERT label, with one axis break per year.
ggplot(data = reddit_sentiment, mapping = aes(x = year, fill = bert_label)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(
    breaks = seq(
      from = min(reddit_sentiment$year),
      to = max(reddit_sentiment$year),
      by = 1
    )
  ) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()

There have been multiple articles in the past couple of years naming Chicago “The Rattiest City”. These news articles, whose presence on the subreddit can be seen from ‘Chicago Tribune’ appearing in the tri-gram analysis, have clearly stimulated the conversation online. Looking at the decrease in discourse from 2018 to 2020, followed by a clear uptick, I would attribute the large proportion of 1 star posts to the increasing rodent issues the city faces.

Set position = 'fill' to see the proportions.

# Share of each BERT label per year (bars scaled to 100%), with one axis
# break per year.
ggplot(data = reddit_sentiment, mapping = aes(x = year, fill = bert_label)) +
  geom_bar(position = 'fill') +
  scale_x_continuous(
    breaks = seq(
      from = min(reddit_sentiment$year),
      to = max(reddit_sentiment$year),
      by = 1
    )
  ) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()