In this assignment I examine how Reddit users have perceived the difficulty and usability of the Empire State Trail, which runs from New York City to Canada, over the past five years.
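The script below calls functions from several scraping, text-mining, and plotting packages; a minimal setup chunk along these lines is assumed (the library list is inferred from the function calls that follow):

# Packages assumed by the rest of the script
library(RedditExtractoR)  # find_subreddits(), find_thread_urls(), get_thread_content()
library(tidyverse)        # dplyr, tidyr, stringr, ggplot2
library(magrittr)         # %<>% assignment pipe
library(lubridate)        # as_datetime(), years()
library(tidytext)         # unnest_tokens(), stop_words
library(wordcloud2)       # wordcloud2()
library(igraph)           # graph_from_data_frame()
library(ggraph)           # ggraph(), geom_edge_link(), geom_node_point()
library(sentimentr)       # sentiment_by()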

# Load the previously scraped and saved thread tables (written further below) so the analysis can be re-run without re-scraping
threads_2.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_2.csv")

threads_3.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_3.csv")

threads_4.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_4.csv")
subreddit_list <- RedditExtractoR::find_subreddits('EmpireStateTrail')
subreddit_list %>% 
  arrange(desc(subscribers)) %>% 
  .[1:25,c('subreddit','title','subscribers')] %>% 
  knitr::kable()

write.csv(subreddit_list, 'subreddit_list.csv', row.names = FALSE)

# Based on this list, I chose the EmpireStateTrail, bicycletouring, and hudsonvalley subreddits

threads_2 <- find_thread_urls(subreddit = 'EmpireStateTrail', 
                              sort_by = 'hot', 
                              period = 'all') %>% 
  drop_na()  %>% 
  mutate(date_utc = as_datetime(date_utc))

threads_2 <- threads_2 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5)))  # keep only threads from the past five years

rownames(threads_2) <- NULL


# Sanitize text
threads_2 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>%
        str_squish()
  ))


write.csv(threads_2, 'threads_2.csv', row.names = FALSE)

##

threads_3 <- find_thread_urls(keywords= 'empire state trail', 
                              subreddit = 'bicycletouring', 
                              sort_by = 'relevance', 
                              period = 'all') %>% 
  drop_na() %>% 
  mutate(date_utc = as_datetime(date_utc))

threads_3 <- threads_3 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5))) %>%  # keep only threads from the past five years
  filter(str_detect(tolower(title), "empire state trail"))

rownames(threads_3) <- NULL

# Sanitize text before saving
threads_3 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>%
        str_squish()
  ))

write.csv(threads_3, 'threads_3.csv', row.names = FALSE)

head(threads_3, 3) %>% knitr::kable()

##  r/hudsonvalley search, filtered and cleaned the same way as the others

threads_4 <- find_thread_urls(keywords= 'empire state trail', 
                              subreddit = 'hudsonvalley', 
                              sort_by = 'relevance', 
                              period = 'all') %>% 
  drop_na() %>%
  mutate(date_utc = as_datetime(date_utc))

threads_4 <- threads_4 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5))) %>%  # keep only threads from the past five years
  filter(str_detect(tolower(title), "empire state trail"))

rownames(threads_4) <- NULL

# Sanitize text before saving
threads_4 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_4, 'threads_4.csv', row.names = FALSE)


# The searches returned 28 threads for threads_2 (r/EmpireStateTrail), 48 for threads_3 (r/bicycletouring), and 2 for threads_4 (r/hudsonvalley). Overall, the most useful subreddit appears to be r/bicycletouring, which also has the most comments.
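To double-check those counts, a quick sketch like this could be run on the three scraped tables (assuming threads_2, threads_3, and threads_4 are still in memory):

# Quick sanity check on the number of scraped threads per search
sapply(list(EmpireStateTrail = threads_2,
            bicycletouring   = threads_3,
            hudsonvalley     = threads_4),
       nrow)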

Getting comments

# Retrieve the comments for the first four threads in each table, sanitize them, then save

threads_2_content <- get_thread_content(threads_2.1$url[1:4])
threads_2_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_2_content$comments, "threads_2_comments.csv", row.names = FALSE)

head(threads_2_content$comments, 3) %>% knitr::kable()

##

threads_3_content <- get_thread_content(threads_3.1$url[1:4])
threads_3_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_3_content$comments, "threads_3_comments.csv", row.names = FALSE)

head(threads_3_content$comments, 3) %>% knitr::kable()

##

threads_4_content <- get_thread_content(threads_4.1$url[1:4])
threads_4_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_4_content$comments, "threads_4_comments.csv", row.names = FALSE)

head(threads_4_content$comments, 3) %>% knitr::kable()

Tokenization

words_1 <- threads_2.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words')

words_1 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/EmpireStateTrail threads")

## 

words_2 <- threads_3.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words') 

words_2 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/bicycletouring threads")

##


words_3 <- threads_4.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words') 

words_3 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/hudsonvalley threads")

unc_words <- bind_rows(words_1, words_2, words_3)

##### STOP WORDS #####

replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads_2.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_1)}, After: {nrow(words_clean)}")
)

## 

words_clean_2 <- threads_3.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_2)}, After: {nrow(words_clean_2)}")
)

##

words_clean_3 <- threads_4.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_3)}, After: {nrow(words_clean_3)}")
)

# binding the three together

words_all <- bind_rows(words_clean, words_clean_2, words_clean_3)

# Visualize

words_all %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: all three subreddits, stop words removed")

Word Cloud!

# Word cloud of the raw (uncleaned) tokens
unc_words %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()

# Word cloud of the cleaned tokens (stop words removed)
words_all %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()

For some reason the second word cloud would not render after the first one, so I included a screenshot of what it produced.
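One possible workaround (a sketch, assuming the htmlwidgets package is installed) is to save the second cloud to a standalone HTML file and screenshot that instead of relying on the inline viewer:

# Save the second word cloud as a standalone HTML page, since it would not render inline
cloud_all <- words_all %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()
htmlwidgets::saveWidget(cloud_all, "words_all_cloud.html", selfcontained = TRUE)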

Bi-Gram

all_threads <- bind_rows(threads_2.1, threads_3.1, threads_4.1)


# Tokenize the combined thread text into bigrams (pairs of consecutive words)
words_bigram <- all_threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 2)

words_bigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
| paired_words | n  |
|:-------------|---:|
| the trail    | 57 |
| of the       | 55 |
| state trail  | 52 |
| on the       | 42 |
| empire state | 39 |
| in the       | 39 |
| along the    | 33 |
| the empire   | 28 |
| to the       | 28 |
| for the      | 21 |
| and i        | 20 |
| to do        | 20 |
| the est      | 19 |
| nyc to       | 18 |
| so i         | 18 |
| the way      | 18 |
| to buffalo   | 18 |
| to nyc       | 18 |
| a lot        | 17 |
| to albany    | 17 |

# Separate the paired words into two columns
words_ngram_pair <- words_bigram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

# Drop rows where either word is a stop word, and keep only letter-containing tokens
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))

# Sort the new bi-gram (n=2) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2) %>%
  arrange(desc(n))

head(words_counts, 20) %>% 
  knitr::kable()
| word1      | word2      | n  |
|:-----------|:-----------|---:|
| erie       | canal      | 12 |
| trail      | day        | 10 |
| stone      | dust       |  7 |
| pjpgauto   | webps      |  6 |
| ref        | wtd        |  6 |
| battery    | park       |  5 |
| google     | maps       |  5 |
| mi         | lunch      |  5 |
| road       | bike       |  5 |
| bike       | path       |  4 |
| bike       | shop       |  4 |
| credit     | card       |  4 |
| crushed    | stone      |  4 |
| east       | greenbush  |  4 |
| hudson     | valley     |  4 |
| palatine   | bridge     |  4 |
| seneca     | falls      |  4 |
| width      | 4032format |  4 |
| 1440format | pjpgauto   |  3 |
| bike       | paths      |  3 |

words_counts %>%
  filter(n >= 3) %>%
  graph_from_data_frame() %>% # convert to an igraph graph
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")

From this bigram analysis we can see the strongest connections. Obviously "empire state trail" is prominent. "Trail" and "day" co-occur 10 times and connect to "conditions" and "rail"; this makes sense since the Empire State Trail is largely a rail trail, and people planning day trips ask about trail conditions. The upper-right cluster of "crushed", "stone", and "dust" also makes sense: commenters discuss the trail surface, probably to decide what kind of bike to bring. The lower cluster doesn't make much sense; it appears to be residue from embedded photo URLs. The other cross that makes sense links "paths", "shop", "bike", "road", and "path": people asking about bike shops, paths, and which roads are rideable. Finally, a funny one is "stealth camping", i.e., camping in not-quite-legal spots, a common occurrence on the Empire State Trail.
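If those tokens really are residue from embedded photo links, one option is to drop them before rebuilding the network. This is only a sketch: the url_junk pattern is my guess at the offending tokens based on the counts table above.

# Drop word pairs that look like image-URL residue (e.g. "pjpgauto", "webps", "4032format");
# the pattern below is a guess based on the bigram counts shown earlier
url_junk <- "format|pjpg|webp|width|auto"
words_counts_clean <- words_ngram_pair_filtered %>%
  filter(!str_detect(word1, url_junk), !str_detect(word2, url_junk)) %>%
  count(word1, word2, sort = TRUE)

The network code above could then be rerun on words_counts_clean.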

Sentiment Analysis

# Reload the saved comment tables for the sentiment analysis
threads_2_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_2_comments.csv")
threads_3_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_3_comments.csv")
threads_4_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_4_comments.csv")

# First attempt: average sentiment per thread on the combined thread text
all_threads_sentiment <- all_threads %>%
  mutate(sentiment = sentiment_by(text)$ave_sentiment)

## This didn't produce the result I was looking for, so I ran sentiment on the individual comments instead

threads_2_df <- tibble(comment = threads_2_content$comment) %>% 
  mutate(source = "threads_2")

threads_3_df <- tibble(comment = threads_3_content$comment) %>% 
  mutate(source = "threads_3")

threads_4_df <- tibble(comment = threads_4_content$comment) %>% 
  mutate(source = "threads_4")

# Combine all three tibbles into one
all_comments <- bind_rows(threads_2_df, threads_3_df, threads_4_df)

##

comments_sentiment <- all_comments %>%
  mutate(sentiment = sentiment_by(comment)$ave_sentiment)

comments_sentiment <- comments_sentiment %>%
  mutate(subreddit = case_when(
    source == "threads_2" ~ "Empire State Trail",
    source == "threads_3" ~ "Bicycle Touring",
    source == "threads_4" ~ "Hudson Valley"
  ))

plot(comments_sentiment$sentiment)

hist(
  comments_sentiment$sentiment,
  breaks = 30,                # number of bins
  col = "skyblue",            # color
  border = "white",
  main = "Distribution of Comment Sentiment",
  xlab = "Sentiment Score",
  ylab = "Number of Comments"
)

ggplot(comments_sentiment, aes(x = sentiment, fill = subreddit)) +
  geom_histogram(bins = 30, color = "black", alpha = 0.6) +
  facet_wrap(~ subreddit, scales = "free_y") +  
  theme_minimal() +
  labs(
    title = "Distribution of Comment Sentiment by Subreddit",
    x = "Sentiment Score",
    y = "Number of Comments"
  ) +
  theme(legend.position = "none")


These three plots were a useful way to look at the sentiment. The first index plot shows a slight downward trend, but overall the comments remain fairly neutral. The histogram shows that most comments hover around neutral, with many only mildly negative or mildly positive. I wouldn't say the data is strongly skewed in either direction, but there are more outliers on the positive side, giving the distribution a right-tailed look. Finally, the faceted histogram shows the distribution of comment sentiment by subreddit. Bicycle Touring and Hudson Valley had the most negative comments, each with about two comments around -0.4, while the Empire State Trail subreddit is largely positive. The facets also show that Bicycle Touring has by far the most comments, which probably dominates the overall histogram and drives the downward drift in the index plot.
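To put rough numbers behind that comparison, the comment-level sentiment could be summarized by subreddit (a sketch using the comments_sentiment table built above):

# Summarize sentiment per subreddit to back up the visual comparison
comments_sentiment %>%
  group_by(subreddit) %>%
  summarise(n_comments       = n(),
            mean_sentiment   = mean(sentiment, na.rm = TRUE),
            median_sentiment = median(sentiment, na.rm = TRUE)) %>%
  knitr::kable()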