In this assignment I examine how Reddit users have perceived the difficulty and usability of the Empire State Trail, which runs from New York City to Canada, over the past five years.
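The script below calls functions from several scraping, text-mining, and plotting packages; a minimal setup chunk along these lines is assumed (the library list is inferred from the function calls that follow):

# Packages assumed by the rest of the script
library(RedditExtractoR)  # find_subreddits(), find_thread_urls(), get_thread_content()
library(tidyverse)        # dplyr, tidyr, stringr, ggplot2
library(magrittr)         # %<>% assignment pipe
library(lubridate)        # as_datetime(), years()
library(tidytext)         # unnest_tokens(), stop_words
library(wordcloud2)       # wordcloud2()
library(igraph)           # graph_from_data_frame()
library(ggraph)           # ggraph(), geom_edge_link(), geom_node_point()
library(sentimentr)       # sentiment_by()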

# Load the previously scraped and saved thread tables (written further below) so the analysis can be re-run without re-scraping
threads_2.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_2.csv")

threads_3.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_3.csv")

threads_4.1 <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_4.csv")
subreddit_list <- RedditExtractoR::find_subreddits('EmpireStateTrail')
subreddit_list %>% 
  arrange(desc(subscribers)) %>% 
  .[1:25,c('subreddit','title','subscribers')] %>% 
  knitr::kable()

write.csv(subreddit_list, 'subreddit_list.csv', row.names = FALSE)

# Based on this list, I chose the EmpireStateTrail, bicycletouring, and hudsonvalley subreddits

threads_2 <- find_thread_urls(subreddit = 'EmpireStateTrail', 
                              sort_by = 'hot', 
                              period = 'all') %>% 
  drop_na()  %>% 
  mutate(date_utc = as_datetime(date_utc))

threads_2 <- threads_2 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5)))  # keep only threads from the past five years

rownames(threads_2) <- NULL


# Sanitize text
threads_2 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>%
        str_squish()
  ))


write.csv(threads_2, 'threads_2.csv', row.names = FALSE)

##

threads_3 <- find_thread_urls(keywords= 'empire state trail', 
                              subreddit = 'bicycletouring', 
                              sort_by = 'relevance', 
                              period = 'all') %>% 
  drop_na() %>% 
  mutate(date_utc = as_datetime(date_utc))

threads_3 <- threads_3 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5))) %>%  # keep only threads from the past five years
  filter(str_detect(tolower(title), "empire state trail"))

rownames(threads_3) <- NULL

# Sanitize text before saving
threads_3 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>%
        str_squish()
  ))

write.csv(threads_3, 'threads_3.csv', row.names = FALSE)

head(threads_3, 3) %>% knitr::kable()

##  r/hudsonvalley search, filtered and cleaned the same way as the others

threads_4 <- find_thread_urls(keywords= 'empire state trail', 
                              subreddit = 'hudsonvalley', 
                              sort_by = 'relevance', 
                              period = 'all') %>% 
  drop_na() %>%
  mutate(date_utc = as_datetime(date_utc))

threads_4 <- threads_4 %>%
  filter(date_utc >= as_datetime(Sys.Date() - years(5))) %>%  # keep only threads from the past five years
  filter(str_detect(tolower(title), "empire state trail"))

rownames(threads_4) <- NULL

# Sanitize text before saving
threads_4 %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_4, 'threads_4.csv', row.names = FALSE)


# The searches returned 28 threads for threads_2 (r/EmpireStateTrail), 48 for threads_3 (r/bicycletouring), and 2 for threads_4 (r/hudsonvalley). Overall, the most useful subreddit appears to be r/bicycletouring, which also has the most comments.
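To double-check those counts, a quick sketch like this could be run on the three scraped tables (assuming threads_2, threads_3, and threads_4 are still in memory):

# Quick sanity check on the number of scraped threads per search
sapply(list(EmpireStateTrail = threads_2,
            bicycletouring   = threads_3,
            hudsonvalley     = threads_4),
       nrow)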

Getting comments

# Retrieve the comments for the first four threads in each table, sanitize them, then save

threads_2_content <- get_thread_content(threads_2.1$url[1:4])
threads_2_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_2_content$comments, "threads_2_comments.csv", row.names = FALSE)

head(threads_2_content$comments, 3) %>% knitr::kable()

##

threads_3_content <- get_thread_content(threads_3.1$url[1:4])
threads_3_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_3_content$comments, "threads_3_comments.csv", row.names = FALSE)

head(threads_3_content$comments, 3) %>% knitr::kable()

##

threads_4_content <- get_thread_content(threads_4.1$url[1:4])
threads_4_content$comments %<>%
  mutate(across(
    where(is.character),
    ~ .x %>%
        str_replace_all("\\|", "/") %>% 
        str_replace_all("\\n", " ") %>% 
        str_squish() 
  ))

write.csv(threads_4_content$comments, "threads_4_comments.csv", row.names = FALSE)

head(threads_4_content$comments, 3) %>% knitr::kable()

Tokenization

words_1 <- threads_2.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words')

words_1 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/EmpireStateTrail threads")

## 

words_2 <- threads_3.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words') 

words_2 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/bicycletouring threads")

##


words_3 <- threads_4.1 %>% 
  unnest_tokens(output = word, input = text, token = 'words') 

words_3 %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: r/hudsonvalley threads")

unc_words <- bind_rows(words_1, words_2, words_3)

##### STOP WORDS #####

replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads_2.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_1)}, After: {nrow(words_clean)}")
)

## 

words_clean_2 <- threads_3.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_2)}, After: {nrow(words_clean_2)}")
)

##

words_clean_3 <- threads_4.1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = 'words') %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removing stop words; there should be fewer words now
print(
  glue::glue("Before: {nrow(words_3)}, After: {nrow(words_clean_3)}")
)

# binding the three together

words_all <- bind_rows(words_clean, words_clean_2, words_clean_3)

# Visualize

words_all %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Top 20 words: all three subreddits, stop words removed")

Word Cloud!

# Word cloud of the raw (uncleaned) tokens
unc_words %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()

# Word cloud of the cleaned tokens (stop words removed)
words_all %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()

For some reason the second word cloud would not render after the first one, so I included a screenshot of what it produced.
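One possible workaround (a sketch, assuming the htmlwidgets package is installed) is to save the second cloud to a standalone HTML file and screenshot that instead of relying on the inline viewer:

# Save the second word cloud as a standalone HTML page, since it would not render inline
cloud_all <- words_all %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()
htmlwidgets::saveWidget(cloud_all, "words_all_cloud.html", selfcontained = TRUE)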

Bi-Gram

all_threads <- bind_rows(threads_2.1, threads_3.1, threads_4.1)


# Tokenize the combined thread text into bigrams (pairs of consecutive words)
words_bigram <- all_threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 2)

words_bigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
| paired_words | n  |
|:-------------|---:|
| the trail    | 57 |
| of the       | 55 |
| state trail  | 52 |
| on the       | 42 |
| empire state | 39 |
| in the       | 39 |
| along the    | 33 |
| the empire   | 28 |
| to the       | 28 |
| for the      | 21 |
| and i        | 20 |
| to do        | 20 |
| the est      | 19 |
| nyc to       | 18 |
| so i         | 18 |
| the way      | 18 |
| to buffalo   | 18 |
| to nyc       | 18 |
| a lot        | 17 |
| to albany    | 17 |

# Separate the paired words into two columns
words_ngram_pair <- words_bigram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

# Drop rows where either word is a stop word, and keep only letter-containing tokens
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))

# Sort the new bi-gram (n=2) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2) %>%
  arrange(desc(n))

head(words_counts, 20) %>% 
  knitr::kable()
| word1      | word2      | n  |
|:-----------|:-----------|---:|
| erie       | canal      | 12 |
| trail      | day        | 10 |
| stone      | dust       |  7 |
| pjpgauto   | webps      |  6 |
| ref        | wtd        |  6 |
| battery    | park       |  5 |
| google     | maps       |  5 |
| mi         | lunch      |  5 |
| road       | bike       |  5 |
| bike       | path       |  4 |
| bike       | shop       |  4 |
| credit     | card       |  4 |
| crushed    | stone      |  4 |
| east       | greenbush  |  4 |
| hudson     | valley     |  4 |
| palatine   | bridge     |  4 |
| seneca     | falls      |  4 |
| width      | 4032format |  4 |
| 1440format | pjpgauto   |  3 |
| bike       | paths      |  3 |

words_counts %>%
  filter(n >= 3) %>%
  graph_from_data_frame() %>% # convert to an igraph graph
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")

From this bigram analysis we can see the strongest connections. Obviously "empire state trail" is prominent. "Trail" and "day" co-occur 10 times and connect to "conditions" and "rail"; this makes sense since the Empire State Trail is largely a rail trail, and people planning day trips ask about trail conditions. The upper-right cluster of "crushed", "stone", and "dust" also makes sense: commenters discuss the trail surface, probably to decide what kind of bike to bring. The lower cluster doesn't make much sense; it appears to be residue from embedded photo URLs. The other cross that makes sense links "paths", "shop", "bike", "road", and "path": people asking about bike shops, paths, and which roads are rideable. Finally, a funny one is "stealth camping", i.e., camping in not-quite-legal spots, a common occurrence on the Empire State Trail.
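If those tokens really are residue from embedded photo links, one option is to drop them before rebuilding the network. This is only a sketch: the url_junk pattern is my guess at the offending tokens based on the counts table above.

# Drop word pairs that look like image-URL residue (e.g. "pjpgauto", "webps", "4032format");
# the pattern below is a guess based on the bigram counts shown earlier
url_junk <- "format|pjpg|webp|width|auto"
words_counts_clean <- words_ngram_pair_filtered %>%
  filter(!str_detect(word1, url_junk), !str_detect(word2, url_junk)) %>%
  count(word1, word2, sort = TRUE)

The network code above could then be rerun on words_counts_clean.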

Sentiment Analysis

# Reload the saved comment tables for the sentiment analysis
threads_2_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_2_comments.csv")
threads_3_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_3_comments.csv")
threads_4_content <- read.csv("C:\\Users\\xfitten3\\OneDrive - Georgia Institute of Technology\\Desktop\\Urban_A\\threads_4_comments.csv")

# First attempt: average sentiment per thread on the combined thread text
all_threads_sentiment <- all_threads %>%
  mutate(sentiment = sentiment_by(text)$ave_sentiment)

## This didn't produce the result I was looking for, so I ran sentiment on the individual comments instead

threads_2_df <- tibble(comment = threads_2_content$comment) %>% 
  mutate(source = "threads_2")

threads_3_df <- tibble(comment = threads_3_content$comment) %>% 
  mutate(source = "threads_3")

threads_4_df <- tibble(comment = threads_4_content$comment) %>% 
  mutate(source = "threads_4")

# Combine all three tibbles into one
all_comments <- bind_rows(threads_2_df, threads_3_df, threads_4_df)

##

comments_sentiment <- all_comments %>%
  mutate(sentiment = sentiment_by(comment)$ave_sentiment)

comments_sentiment <- comments_sentiment %>%
  mutate(subreddit = case_when(
    source == "threads_2" ~ "Empire State Trail",
    source == "threads_3" ~ "Bicycle Touring",
    source == "threads_4" ~ "Hudson Valley"
  ))

plot(comments_sentiment$sentiment)

hist(
  comments_sentiment$sentiment,
  breaks = 30,                # number of bins
  col = "skyblue",            # color
  border = "white",
  main = "Distribution of Comment Sentiment",
  xlab = "Sentiment Score",
  ylab = "Number of Comments"
)

ggplot(comments_sentiment, aes(x = sentiment, fill = subreddit)) +
  geom_histogram(bins = 30, color = "black", alpha = 0.6) +
  facet_wrap(~ subreddit, scales = "free_y") +  
  theme_minimal() +
  labs(
    title = "Distribution of Comment Sentiment by Subreddit",
    x = "Sentiment Score",
    y = "Number of Comments"
  ) +
  theme(legend.position = "none")


These three plots were a useful way to look at the sentiment. The first index plot shows a slight downward trend, but overall the comments remain fairly neutral. The histogram shows that most comments hover around neutral, with many only mildly negative or mildly positive. I wouldn't say the data is strongly skewed in either direction, but there are more outliers on the positive side, giving the distribution a right-tailed look. Finally, the faceted histogram shows the distribution of comment sentiment by subreddit. Bicycle Touring and Hudson Valley had the most negative comments, each with about two comments around -0.4, while the Empire State Trail subreddit is largely positive. The facets also show that Bicycle Touring has by far the most comments, which probably dominates the overall histogram and drives the downward drift in the index plot.
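To put rough numbers behind that comparison, the comment-level sentiment could be summarized by subreddit (a sketch using the comments_sentiment table built above):

# Summarize sentiment per subreddit to back up the visual comparison
comments_sentiment %>%
  group_by(subreddit) %>%
  summarise(n_comments       = n(),
            mean_sentiment   = mean(sentiment, na.rm = TRUE),
            median_sentiment = median(sentiment, na.rm = TRUE)) %>%
  knitr::kable()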