Install packages
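
The chunks below rely on the following packages; loading them up front is a minimal sketch inferred from the functions used later in this document (install.packages() is only needed once).

# install once if needed
# install.packages(c("RedditExtractoR", "tidyverse", "magrittr", "lubridate",
#                    "anytime", "tidytext", "wordcloud2", "stringi",
#                    "igraph", "ggraph", "glue"))

library(RedditExtractoR)  # find_thread_urls(), get_thread_content()
library(tidyverse)        # dplyr, ggplot2, stringr, tidyr
library(magrittr)         # the %<>% assignment pipe
library(lubridate)        # wday()
library(anytime)          # anytime()
library(tidytext)         # unnest_tokens(), stop_words
library(wordcloud2)       # wordcloud2()
library(igraph)           # graph_from_data_frame()
library(ggraph)           # ggraph(), geom_edge_link(), geom_node_point()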

1. Topic: Changes in sentiment towards ‘Archery’ over the past year.

2. Search Reddit threads using the keyword 'Archery'

2.1 Downloading Reddit threads by keyword or by subreddit

# search by keyword
threads_1 <- find_thread_urls(keywords = "Archery", 
                              sort_by = 'relevance', 
                              period = 'year')
# which subreddits do these threads come from? (top 10)
threads_1$subreddit %>% table() %>% sort(decreasing = TRUE) %>% head(10)

# search within a specific subreddit
threads_2 <- find_thread_urls(subreddit = 'Archery', 
                              sort_by = 'top', 
                              period = 'year')

# save thread metadata so the download does not need to be repeated
write.csv(threads_2, "reddit_threads_info.csv", row.names = FALSE)

2.2 Downloading comments and additional information

# get full thread content (incl. individual comments) for the first 10 threads
threads_2_content <- get_thread_content(threads_2$url[1:10])
names(threads_2_content)

# check upvotes and downvotes
print(threads_2_content$threads[,c('upvotes','downvotes','up_ratio')])
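
get_thread_content() also returns the individual comments alongside the thread metadata. A quick way to inspect that table (column names can vary across RedditExtractoR versions) is:

# peek at the comments table (one row per comment)
dplyr::glimpse(threads_2_content$comments)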

2.3 Analyses of posting date/time

# read the saved threads back in and create a new column: date
threads_2 <- read.csv("reddit_threads_info.csv")

threads_2 %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))

# number of threads by week
threads_2 %>% 
  ggplot(aes(x = date)) +
  geom_histogram(color = "black", binwidth = 60*60*24*7) +  # one bar per week (binwidth in seconds)
  scale_x_datetime(date_labels = "%b %y",
                   breaks = seq(min(threads_2$date, na.rm = T), 
                                max(threads_2$date, na.rm = T), 
                                by = "1 month")) +
  theme_minimal()
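
If exact weekly counts are needed rather than a histogram, the same information can be tabulated with lubridate::floor_date(); this is an alternative view, not part of the original analysis.

# number of threads per calendar week, as a table
threads_2 %>% 
  count(week = floor_date(date, unit = "week")) %>% 
  head()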

# create new columns: day_of_week, is_weekend
# (wday() labels follow the system locale; the weekend check uses day numbers, 1 = Sunday and 7 = Saturday, to stay locale-independent)
threads_2 %<>%  
  mutate(day_of_week = wday(date, label = TRUE)) %>% 
  mutate(is_weekend = ifelse(wday(date) %in% c(1, 7), "Weekend", "Weekday"))

# number of threads by day of week
threads_2 %>% 
  ggplot(aes(x = day_of_week, fill = is_weekend)) +
  geom_bar(color = 'black') +
  scale_fill_manual(values = c("Weekday" = "gray", "Weekend" = "pink")) + 
  theme_minimal()
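
As a quick numeric companion to the bar chart, the share of threads posted on weekends can be computed directly (a small addition, not in the original analysis):

# proportion of threads posted on weekends vs. weekdays
threads_2 %>% 
  count(is_weekend) %>% 
  mutate(prop = n / sum(n))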

print(threads_2$timestamp[1])
## [1] 1685391012
print(threads_2$timestamp[1] %>% anytime(tz = anytime:::getTZ()))
## [1] "2023-05-29 16:10:12 EDT"
# extract the hour of day from the unix timestamp
threads_2 %<>%  
  mutate(time = timestamp %>% 
           anytime(tz = anytime:::getTZ()) %>% 
           str_split('-| |:') %>% 
           sapply(function(x) as.numeric(x[4]))) # 4th piece of "YYYY-MM-DD HH:MM:SS" is the hour
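
The same hour-of-day column can be obtained more directly with lubridate::hour(); the snippet below is an equivalent alternative to the string-splitting approach above.

# equivalent: extract the hour with lubridate instead of splitting strings
threads_2 <- threads_2 %>% 
  mutate(time = hour(anytime(timestamp, tz = anytime:::getTZ())))
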
# number of threads by time of day
threads_2 %>% 
  ggplot(aes(x = time)) +
  geom_histogram(bins = 24, color = 'black') +
  scale_x_continuous(breaks = seq(0, 24, by=1)) + 
  theme_minimal()

3. Tokenization and text data cleaning

3.1 Tokenization

# Tokenization (word tokens)
words <- threads_2 %>% 
  unnest_tokens(output = word, input = text, token = "words")

words %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

3.2 Stop words

# load list of stop words - from the tidytext package
data("stop_words")
# view random 50 words
print(stop_words$word[sample(1:nrow(stop_words), 50)])
##  [1] "out"           "ourselves"     "anyways"       "whether"      
##  [5] "overall"       "take"          "aside"         "theirs"       
##  [9] "high"          "up"            "somewhere"     "if"           
## [13] "again"         "corresponding" "itself"        "wonder"       
## [17] "second"        "still"         "yours"         "to"           
## [21] "need"          "hadn't"        "could"         "newest"       
## [25] "once"          "did"           "even"          "problems"     
## [29] "without"       "changes"       "big"           "provides"     
## [33] "interest"      "c's"           "trying"        "your"         
## [37] "further"       "hers"          "not"           "over"         
## [41] "what's"        "until"         "asked"         "during"       
## [45] "sent"          "never"         "hasn't"        "quite"        
## [49] "very"          "they've"
# Regex that matches URLs and leftover HTML entities (&amp;, &lt;, &gt;)
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads_2 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter (drops numbers and punctuation-only tokens)
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 18614, After: 6543
words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")
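
Since every thread comes from an archery search, topic words such as 'archery' or 'bow' are likely to dominate the counts. If that is not informative, a custom stop-word list can be removed on top of the tidytext one; the specific words below are illustrative assumptions, not part of the original analysis.

# hypothetical domain-specific stop words, removed in addition to tidytext's list
my_stop_words <- tibble(word = c("archery", "bow"), lexicon = "custom")

words_clean_custom <- words_clean %>% 
  anti_join(my_stop_words, by = "word")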

4. Word cloud

n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# the 20 most frequent words get a random vivid color; all remaining words are grey
pal <- c(pal, rep("grey", 10000))
words_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)
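
wordcloud2() returns an HTML widget rather than a static plot. If the cloud needs to be kept outside the notebook, one option (an assumption, not part of the original workflow) is to save it with htmlwidgets::saveWidget():

# save the word cloud as a standalone HTML file (requires the htmlwidgets package)
wc <- words_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, minRotation = 0, maxRotation = 0, ellipticity = 0.8)
htmlwidgets::saveWidget(wc, "wordcloud.html", selfcontained = TRUE)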

5. Tri-gram analysis

Extract tri-grams from text data

# get n-grams; try other values of n (e.g., n = 3 or n = 4)
words_ngram <- threads_2 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)
# show the 20 most frequent tri-grams
words_ngram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
|paired_words   |   n|
|:--------------|---:|
|NA             | 646|
|i decided to   |  10|
|it s a         |  10|
|i have a       |   9|
|i ve been      |   9|
|i have been    |   8|
|my first bow   |   8|
|at 20 yards    |   7|
|i don t        |   7|
|i m not        |   7|
|i want to      |   7|
|i wanted to    |   7|
|new to archery |   7|
|the first time |   7|
|and i m        |   6|
|out of the     |   6|
|a lot of       |   5|
|arrows in the  |   5|
|as the title   |   5|
|do you guys    |   5|
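
The most frequent "tri-gram" is NA: threads whose text field is empty (for example image-only posts) yield an NA token. Dropping those rows before counting keeps the table meaningful; this line is a small addition to the original pipeline.

# drop NA n-grams produced by threads with empty text
words_ngram %<>% filter(!is.na(paired_words))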

Remove tri-grams containing stop words or non-alphabetic terms, then present the tri-gram frequencies in a table

# separate each tri-gram into three columns
words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2","word3"), sep = " ")

# keep only tri-grams in which none of the three words is a stop word
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word ) %>% 
  # keep only words that contain at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") &  str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what counts as ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2)& stri_enc_isascii(word3))

# Sort the new tri-gram (n=3) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2,word3) %>%
  arrange(desc(n))

head(words_counts, 20) %>% 
  knitr::kable()
|word1      |word2    |word3     |  n|
|:----------|:--------|:---------|--:|
|horse      |stall    |mat       |  2|
|lbs        |draw     |weight    |  2|
|limbs      |akusta   |fotron    |  2|
|mybo       |wave     |xl        |  2|
|posters    |posting  |videos    |  2|
|riser      |hoyt     |arcos     |  2|
|robin      |hood     |shot      |  2|
|turkey     |feather  |fletching |  2|
|turkish    |style    |bow       |  2|
|wns        |motive   |fx        |  2|
|1059format |pjpgauto |webps     |  1|
|1536format |pjpgauto |webps     |  1|
|18m        |indoor   |round     |  1|
|18m        |indoor   |shooting  |  1|
|2604format |pjpgauto |webps     |  1|
|30lb       |recurve  |wood      |  1|
|35lb       |buck     |trail     |  1|
|35lb       |psg      |nighthawk |  1|
|38lb       |draw     |weight    |  1|
|3d         |night    |shoots    |  1|

Word network

# plot word network (only tri-grams that appear at least twice)
words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")
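
Note that graph_from_data_frame() uses only the first two columns as the edge list, so word3 enters the plot above merely as an edge attribute, not as a node. If all three words of each tri-gram should appear in the network, one option (a sketch, not the original author's code) is to stack the two consecutive word pairs before building the graph:

# build edges from both consecutive pairs of each tri-gram: word1-word2 and word2-word3
trigram_edges <- bind_rows(
  words_counts %>% select(from = word1, to = word2, n),
  words_counts %>% select(from = word2, to = word3, n)
)

trigram_edges %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks (tri-grams)", x = "", y = "")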