# packages used in this section (load here if not already attached earlier in the document)
library(RedditExtractoR)
library(tidyverse)   # dplyr, ggplot2, stringr, tidyr, ...
library(magrittr)    # for %<>%
library(lubridate)
library(anytime)
library(tidytext)
library(wordcloud2)
library(igraph)
library(ggraph)

# using keyword
threads_1 <- find_thread_urls(keywords = "Archery",
                              sort_by = 'relevance',
                              period = 'year')
# top 10 subreddits represented in the keyword-search results
threads_1$subreddit %>% table() %>% sort(decreasing = TRUE) %>% head(10)
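# The same tabulation can be written as a dplyr pipeline, which returns a tidy
# data frame instead of a named table (a minimal equivalent, not from the source):
threads_1 %>%
  count(subreddit, sort = TRUE) %>%
  head(10)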
# using subreddit
threads_2 <- find_thread_urls(subreddit = 'Archery',
                              sort_by = 'top',
                              period = 'year')
write.csv(threads_2, "reddit_threads_info.csv", row.names = FALSE)
# get full thread content (thread metadata plus individual comments) for the first 10 threads
threads_2_content <- get_thread_content(threads_2$url[1:10])
names(threads_2_content)
# check upvotes and downvotes
print(threads_2_content$threads[,c('upvotes','downvotes','up_ratio')])
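# get_thread_content() also returns a comments data frame; the sketch below peeks
# at it (the 'author' column follows RedditExtractoR's docs -- verify with names()):
comments_df <- threads_2_content$comments
names(comments_df)                # confirm which columns your version returns
comments_df %>%
  count(author, sort = TRUE) %>%  # most active commenters in these 10 threads
  head(10)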
# create new column: date
threads_2 <- read.csv("reddit_threads_info.csv")
threads_2 %<>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))
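# quick sanity check (not in the original): the date range the threads cover
range(threads_2$date)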
# number of threads by week
threads_2 %>%
  ggplot(aes(x = date)) +
  geom_histogram(color = "black", position = 'stack',
                 binwidth = 60*60*24*7) +  # one-week bins, in seconds
  scale_x_datetime(date_labels = "%b %y",
                   breaks = seq(min(threads_2$date, na.rm = TRUE),
                                max(threads_2$date, na.rm = TRUE),
                                by = "1 month")) +
  theme_minimal()
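# a numeric cross-check of the histogram (a sketch using lubridate::floor_date):
threads_2 %>%
  mutate(week = floor_date(date, unit = "week")) %>%
  count(week) %>%
  head()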
# create new columns: day_of_week, is_weekend
threads_2 %<>%
  mutate(day_of_week = wday(date, label = TRUE)) %>%
  mutate(is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"))
# number of threads by day of week
threads_2 %>%
  ggplot(aes(x = day_of_week, fill = is_weekend)) +
  geom_bar(color = 'black') +
  scale_fill_manual(values = c("Weekday" = "gray", "Weekend" = "pink")) +
  theme_minimal()
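# the same breakdown as a table (a quick companion to the bar chart, not from the source):
threads_2 %>% count(day_of_week, is_weekend)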
# timestamps are Unix epoch seconds; anytime() converts them to local time
print(threads_2$timestamp[1])
## [1] 1685391012
print(threads_2$timestamp[1] %>% anytime(tz = anytime:::getTZ()))
## [1] "2023-05-29 16:10:12 EDT"
threads_2 %<>%
  mutate(time = timestamp %>%
           anytime(tz = anytime:::getTZ()) %>%    # epoch -> "YYYY-MM-DD HH:MM:SS"
           str_split('-| |:') %>%                 # split into date/time pieces
           sapply(function(x) as.numeric(x[4])))  # the 4th piece is the hour
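# the hour can also be pulled directly with lubridate (a sketch, assuming the
# same local timezone; 'time2' is an illustrative name, not from the source):
threads_2 %>%
  mutate(time2 = hour(as_datetime(timestamp, tz = Sys.timezone()))) %>%
  summarise(all_match = all(time == time2))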
# number of threads by time of day
threads_2 %>%
  ggplot(aes(x = time)) +
  geom_histogram(bins = 24, color = 'black') +
  scale_x_continuous(breaks = seq(0, 24, by = 1)) +
  theme_minimal()
# Tokenization (word tokens)
words <- threads_2 %>%
  unnest_tokens(output = word, input = text, token = "words")
words %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")
# load the list of stop words from the tidytext package
data("stop_words")
# view a random sample of 50 stop words
print(stop_words$word[sample(1:nrow(stop_words), 50)])
## [1] "out" "ourselves" "anyways" "whether"
## [5] "overall" "take" "aside" "theirs"
## [9] "high" "up" "somewhere" "if"
## [13] "again" "corresponding" "itself" "wonder"
## [17] "second" "still" "yours" "to"
## [21] "need" "hadn't" "could" "newest"
## [25] "once" "did" "even" "problems"
## [29] "without" "changes" "big" "provides"
## [33] "interest" "c's" "trying" "your"
## [37] "further" "hers" "not" "over"
## [41] "what's" "until" "asked" "during"
## [45] "sent" "never" "hasn't" "quite"
## [49] "very" "they've"
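# The lexicon can be extended with domain-specific stop words before the
# anti_join() below; the words here are illustrative picks, not part of the
# original analysis (swap 'stop_words_ext' in if you use it):
stop_words_ext <- bind_rows(stop_words,
                            tibble(word = c("archery", "bow"), lexicon = "custom"))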
# Regex that matches URL-type strings (plus stray &, <, > entities)
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
words_clean <- threads_2 %>%
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>%
  # drop stop words
  anti_join(stop_words, by = "word") %>%
  # keep only tokens containing at least one letter (drops pure numbers/punctuation)
  filter(str_detect(word, "[a-z]"))
# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 18614, After: 6543
words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")
# build a random palette of 20 vivid colors for the most frequent words
n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# pad the palette so every word beyond the top 20 is drawn in grey
pal <- c(pal, rep("grey", 10000))
words_clean %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
             minRotation = 0,
             maxRotation = 0,
             ellipticity = 0.8)
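# Note: wordcloud2() returns an HTML widget, not a base/ggplot figure, so
# knitting to PDF typically needs webshot (or a manual screenshot) to embed
# it as a static image.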
# get n-grams; try different values of n (e.g., n = 3 or n = 4)
words_ngram <- threads_2 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)
# show the most frequent n-grams
words_ngram %>%
  count(paired_words, sort = TRUE) %>%
  head(20) %>%
  knitr::kable()
| paired_words | n |
|---|---|
| NA | 646 |
| i decided to | 10 |
| it s a | 10 |
| i have a | 9 |
| i ve been | 9 |
| i have been | 8 |
| my first bow | 8 |
| at 20 yards | 7 |
| i don t | 7 |
| i m not | 7 |
| i want to | 7 |
| i wanted to | 7 |
| new to archery | 7 |
| the first time | 7 |
| and i m | 6 |
| out of the | 6 |
| a lot of | 5 |
| arrows in the | 5 |
| as the title | 5 |
| do you guys | 5 |
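# The NA row at the top comes from threads whose text has fewer than three
# words: unnest_tokens() returns NA for those. They would also fall out of the
# str_detect() filters below, but dropping them explicitly is cleaner:
words_ngram %<>% filter(!is.na(paired_words))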
# separate each tri-gram into three columns
words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# drop rows where any of the three columns contains a stop word
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>%
  # keep only tokens containing at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>%
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))
# Sort the tri-gram (n = 3) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))
head(words_counts, 20) %>%
  knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| horse | stall | mat | 2 |
| lbs | draw | weight | 2 |
| limbs | akusta | fotron | 2 |
| mybo | wave | xl | 2 |
| posters | posting | videos | 2 |
| riser | hoyt | arcos | 2 |
| robin | hood | shot | 2 |
| turkey | feather | fletching | 2 |
| turkish | style | bow | 2 |
| wns | motive | fx | 2 |
| 1059format | pjpgauto | webps | 1 |
| 1536format | pjpgauto | webps | 1 |
| 18m | indoor | round | 1 |
| 18m | indoor | shooting | 1 |
| 2604format | pjpgauto | webps | 1 |
| 30lb | recurve | wood | 1 |
| 35lb | buck | trail | 1 |
| 35lb | psg | nighthawk | 1 |
| 38lb | draw | weight | 1 |
| 3d | night | shoots | 1 |
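# Rows like "1059format / pjpgauto / webps" are residue of Reddit image-URL
# query strings (e.g. format=pjpg&auto=webp) that the URL regex above does not
# fully strip. A broader pattern is one way to catch them before tokenizing
# ('replace_reg_broad' is an illustrative name, not from the source):
replace_reg_broad <- "https?://\\S+|&|<|>"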
# plot word network (keep tri-grams that appear at least twice)
words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +  # constant alpha belongs outside aes()
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")
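# Note: the Fruchterman-Reingold ("fr") layout is stochastic, so node positions
# change between runs; seeding the RNG immediately before the ggraph pipeline
# above makes the figure reproducible, e.g.:
set.seed(2023)  # any fixed seed works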