# packages used in this section (load here if not already attached earlier in the document)
library(RedditExtractoR)
library(tidyverse)   # dplyr, ggplot2, stringr, tidyr, ...
library(magrittr)    # for %<>%
library(lubridate)
library(anytime)
library(tidytext)
library(wordcloud2)
library(igraph)
library(ggraph)

# using keyword
threads_1 <- find_thread_urls(keywords = "Archery",
                              sort_by = 'relevance',
                              period = 'year')
# top 10 subreddits represented in the keyword-search results
threads_1$subreddit %>% table() %>% sort(decreasing = TRUE) %>% head(10)
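# The same tabulation can be written as a dplyr pipeline, which returns a tidy
# data frame instead of a named table (a minimal equivalent, not from the source):
threads_1 %>%
  count(subreddit, sort = TRUE) %>%
  head(10)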
# using subreddit
threads_2 <- find_thread_urls(subreddit = 'Archery',
                              sort_by = 'top',
                              period = 'year')
write.csv(threads_2, "reddit_threads_info.csv", row.names = FALSE)
# get full thread content (thread metadata plus individual comments) for the first 10 threads
threads_2_content <- get_thread_content(threads_2$url[1:10])
names(threads_2_content)
# check upvotes and downvotes
print(threads_2_content$threads[,c('upvotes','downvotes','up_ratio')])
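# get_thread_content() also returns a comments data frame; the sketch below peeks
# at it (the 'author' column follows RedditExtractoR's docs -- verify with names()):
comments_df <- threads_2_content$comments
names(comments_df)                # confirm which columns your version returns
comments_df %>%
  count(author, sort = TRUE) %>%  # most active commenters in these 10 threads
  head(10)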
# create new column: date
threads_2 <- read.csv("reddit_threads_info.csv")
threads_2 %<>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))
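# quick sanity check (not in the original): the date range the threads cover
range(threads_2$date)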
# number of threads by week
threads_2 %>%
  ggplot(aes(x = date)) +
  geom_histogram(color = "black", position = 'stack',
                 binwidth = 60*60*24*7) +  # one-week bins, in seconds
  scale_x_datetime(date_labels = "%b %y",
                   breaks = seq(min(threads_2$date, na.rm = TRUE),
                                max(threads_2$date, na.rm = TRUE),
                                by = "1 month")) +
  theme_minimal()
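# a numeric cross-check of the histogram (a sketch using lubridate::floor_date):
threads_2 %>%
  mutate(week = floor_date(date, unit = "week")) %>%
  count(week) %>%
  head()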
# create new columns: day_of_week, is_weekend
threads_2 %<>%
  mutate(day_of_week = wday(date, label = TRUE)) %>%
  mutate(is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"))
# number of threads by day of week
threads_2 %>%
  ggplot(aes(x = day_of_week, fill = is_weekend)) +
  geom_bar(color = 'black') +
  scale_fill_manual(values = c("Weekday" = "gray", "Weekend" = "pink")) +
  theme_minimal()
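# the same breakdown as a table (a quick companion to the bar chart, not from the source):
threads_2 %>% count(day_of_week, is_weekend)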
# timestamps are Unix epoch seconds; anytime() converts them to local time
print(threads_2$timestamp[1])
## [1] 1685391012
print(threads_2$timestamp[1] %>% anytime(tz = anytime:::getTZ()))
## [1] "2023-05-29 16:10:12 EDT"
threads_2 %<>%
  mutate(time = timestamp %>%
           anytime(tz = anytime:::getTZ()) %>%    # epoch -> "YYYY-MM-DD HH:MM:SS"
           str_split('-| |:') %>%                 # split into date/time pieces
           sapply(function(x) as.numeric(x[4])))  # the 4th piece is the hour
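# the hour can also be pulled directly with lubridate (a sketch, assuming the
# same local timezone; 'time2' is an illustrative name, not from the source):
threads_2 %>%
  mutate(time2 = hour(as_datetime(timestamp, tz = Sys.timezone()))) %>%
  summarise(all_match = all(time == time2))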
# number of threads by time of day
threads_2 %>%
  ggplot(aes(x = time)) +
  geom_histogram(bins = 24, color = 'black') +
  scale_x_continuous(breaks = seq(0, 24, by = 1)) +
  theme_minimal()
# Tokenization (word tokens)
words <- threads_2 %>%
  unnest_tokens(output = word, input = text, token = "words")
words %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")
# load the list of stop words from the tidytext package
data("stop_words")
# view a random sample of 50 stop words
print(stop_words$word[sample(1:nrow(stop_words), 50)])
## [1] "out" "ourselves" "anyways" "whether"
## [5] "overall" "take" "aside" "theirs"
## [9] "high" "up" "somewhere" "if"
## [13] "again" "corresponding" "itself" "wonder"
## [17] "second" "still" "yours" "to"
## [21] "need" "hadn't" "could" "newest"
## [25] "once" "did" "even" "problems"
## [29] "without" "changes" "big" "provides"
## [33] "interest" "c's" "trying" "your"
## [37] "further" "hers" "not" "over"
## [41] "what's" "until" "asked" "during"
## [45] "sent" "never" "hasn't" "quite"
## [49] "very" "they've"
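# The lexicon can be extended with domain-specific stop words before the
# anti_join() below; the words here are illustrative picks, not part of the
# original analysis (swap 'stop_words_ext' in if you use it):
stop_words_ext <- bind_rows(stop_words,
                            tibble(word = c("archery", "bow"), lexicon = "custom"))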
# Regex that matches URL-type strings (plus stray &, <, > entities)
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
words_clean <- threads_2 %>%
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>%
  # drop stop words
  anti_join(stop_words, by = "word") %>%
  # keep only tokens containing at least one letter (drops pure numbers/punctuation)
  filter(str_detect(word, "[a-z]"))
# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 18614, After: 6543
words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")
# build a random palette of 20 vivid colors for the most frequent words
n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# pad the palette so every word beyond the top 20 is drawn in grey
pal <- c(pal, rep("grey", 10000))
words_clean %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
             minRotation = 0,
             maxRotation = 0,
             ellipticity = 0.8)
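# Note: wordcloud2() returns an HTML widget, not a base/ggplot figure, so
# knitting to PDF typically needs webshot (or a manual screenshot) to embed
# it as a static image.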
# get n-grams; try different values of n (e.g., n = 3 or n = 4)
words_ngram <- threads_2 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)
# show the most frequent n-grams
words_ngram %>%
  count(paired_words, sort = TRUE) %>%
  head(20) %>%
  knitr::kable()
| paired_words | n |
|---|---|
| NA | 646 |
| i decided to | 10 |
| it s a | 10 |
| i have a | 9 |
| i ve been | 9 |
| i have been | 8 |
| my first bow | 8 |
| at 20 yards | 7 |
| i don t | 7 |
| i m not | 7 |
| i want to | 7 |
| i wanted to | 7 |
| new to archery | 7 |
| the first time | 7 |
| and i m | 6 |
| out of the | 6 |
| a lot of | 5 |
| arrows in the | 5 |
| as the title | 5 |
| do you guys | 5 |
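# The NA row at the top comes from threads whose text has fewer than three
# words: unnest_tokens() returns NA for those. They would also fall out of the
# str_detect() filters below, but dropping them explicitly is cleaner:
words_ngram %<>% filter(!is.na(paired_words))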
# separate each tri-gram into three columns
words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# drop rows where any of the three columns contains a stop word
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>%
  # keep only tokens containing at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>%
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))
# Sort the tri-gram (n = 3) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))
head(words_counts, 20) %>%
  knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| horse | stall | mat | 2 |
| lbs | draw | weight | 2 |
| limbs | akusta | fotron | 2 |
| mybo | wave | xl | 2 |
| posters | posting | videos | 2 |
| riser | hoyt | arcos | 2 |
| robin | hood | shot | 2 |
| turkey | feather | fletching | 2 |
| turkish | style | bow | 2 |
| wns | motive | fx | 2 |
| 1059format | pjpgauto | webps | 1 |
| 1536format | pjpgauto | webps | 1 |
| 18m | indoor | round | 1 |
| 18m | indoor | shooting | 1 |
| 2604format | pjpgauto | webps | 1 |
| 30lb | recurve | wood | 1 |
| 35lb | buck | trail | 1 |
| 35lb | psg | nighthawk | 1 |
| 38lb | draw | weight | 1 |
| 3d | night | shoots | 1 |
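# Rows like "1059format / pjpgauto / webps" are residue of Reddit image-URL
# query strings (e.g. format=pjpg&auto=webp) that the URL regex above does not
# fully strip. A broader pattern is one way to catch them before tokenizing
# ('replace_reg_broad' is an illustrative name, not from the source):
replace_reg_broad <- "https?://\\S+|&|<|>"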
# plot word network (keep tri-grams that appear at least twice)
words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +  # constant alpha belongs outside aes()
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")
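# Note: the Fruchterman-Reingold ("fr") layout is stochastic, so node positions
# change between runs; seeding the RNG immediately before the ggraph pipeline
# above makes the figure reproducible, e.g.:
set.seed(2023)  # any fixed seed works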