1. Setup

# install.packages(c("tuber", "tidytext", "dplyr", "stringr",
#                     "ggplot2", "wordcloud2", "readr"))

library(tuber)
library(tidytext)
library(dplyr)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(readr)

1.1 Authenticate

# Read the OAuth client credentials from the environment instead of
# hard-coding them in the script, so secrets never end up in the source or the
# rendered report. Define YOUTUBE_CLIENT_ID and YOUTUBE_CLIENT_SECRET in
# ~/.Renviron (or the calling shell) before running.
yt_client_id     <- Sys.getenv("YOUTUBE_CLIENT_ID")
yt_client_secret <- Sys.getenv("YOUTUBE_CLIENT_SECRET")

# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than starting the OAuth flow with empty credentials.
if (!nzchar(yt_client_id) || !nzchar(yt_client_secret)) {
  stop(
    "Set YOUTUBE_CLIENT_ID and YOUTUBE_CLIENT_SECRET before running ",
    "(for example in ~/.Renviron).",
    call. = FALSE
  )
}

yt_oauth(app_id     = yt_client_id,
         app_secret = yt_client_secret)

2. Target Video

# Video under analysis: its display title and its YouTube video ID.
target_video_title <- "Action Bronson Eats with NYC Mayor Zohran Mamdani: FTD"
target_video_id    <- "JSYjv7QprJw"

# Echo the ID and the watch URL built from it.
cat("Video ID:", target_video_id, "\n", sep = " ")
## Video ID: JSYjv7QprJw
cat(paste0("URL: https://www.youtube.com/watch?v=", target_video_id, "\n"))
## URL: https://www.youtube.com/watch?v=JSYjv7QprJw

3. Scrape Comments (100+)

get_all_comments() handles pagination automatically.

# Download the comments for the target video; get_all_comments() takes care of
# paging through the API results.
comments_raw_list <- get_all_comments(video_id = target_video_id)

# Keep only the fields used downstream, renamed to snake_case.
comments_raw <- comments_raw_list %>%
  as_tibble() %>%
  transmute(
    # Use the API's comment identifier; previously this column was a copy of
    # the comment text, so it could not serve as a key.
    comment_id   = id,
    author       = authorDisplayName,
    text         = textOriginal,
    like_count   = as.integer(likeCount),
    published_at = publishedAt
  )

cat("\nTotal comments collected:", nrow(comments_raw), "\n")

3.1 Save Raw Data

# Write the raw comment table to a CSV file (path relative to the working
# directory), then confirm on the console.
invisible(write_csv(comments_raw, "youtube_comments_raw.csv"))
cat("Saved to youtube_comments_raw.csv\n")

4. Data Cleaning

# Clean the raw comments:
#   1. collapse runs of whitespace and trim both ends,
#   2. drop missing comments and those of 3 characters or fewer,
#   3. drop repeats of the same text by the same author,
#   4. parse the timestamp string as UTC date-times.
# Whitespace is normalized FIRST so that the length filter and the duplicate
# check both see the cleaned text; otherwise comments that differ only in
# spacing survive de-duplication and padded near-empty comments pass the
# length filter.
comments_clean <- comments_raw %>%
  mutate(text = str_squish(text)) %>%
  filter(!is.na(text), str_length(text) > 3) %>%
  distinct(text, author, .keep_all = TRUE) %>%
  mutate(
    published_at = as.POSIXct(published_at,
                              format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC"),
    like_count   = as.integer(like_count)
  )

cat("Comments after cleaning:", nrow(comments_clean), "\n")
## Comments after cleaning: 1858
glimpse(comments_clean)
## Rows: 1,858
## Columns: 5
## $ comment_id   <chr> "That was wholesome AF, thanks I needed them smiles and l…
## $ author       <chr> "@abegending", "@antondoty5552", "@jamesripley4168", "@ju…
## $ text         <chr> "That was wholesome AF, thanks I needed them smiles and l…
## $ like_count   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ published_at <dttm> 2026-06-30 00:44:23, 2026-06-29 17:38:29, 2026-06-29 14:…

5. Text Analysis

5.1 Tokenize and Remove Stop Words

# Extra words to discard on top of tidytext's built-in stop-word list.
custom_stops <- tibble(
  word = c("https", "t.co", "amp", "na", "ya", "video", "watch", "youtube")
)

# One row per word: split each comment into lower-cased word tokens, keep only
# purely alphabetic tokens longer than two characters, then remove the
# standard and the custom stop words.
tokens <- comments_clean %>%
  select(author, text) %>%
  unnest_tokens(output = word, input = text) %>%
  filter(
    str_detect(word, "^[a-z]+$"),
    str_length(word) > 2
  ) %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(custom_stops, by = "word")

5.2 Word Frequency

# Tally how often each remaining word occurs, most frequent first.
word_freq <- tokens %>%
  count(word) %>%
  arrange(desc(n))

# Show the 20 most frequent words.
cat("Top 20 words:\n")
## Top 20 words:
print(slice_head(word_freq, n = 20))
## # A tibble: 20 × 2
##    word        n
##    <chr>   <int>
##  1 action    196
##  2 mayor     142
##  3 love      116
##  4 mamdani   114
##  5 bronson   103
##  6 food       94
##  7 people     78
##  8 nyc        65
##  9 guy        61
## 10 episode    52
## 11 york       52
## 12 zohran     49
## 13 bro        48
## 14 dude       48
## 15 lol        47
## 16 time       41
## 17 lmao       35
## 18 city       34
## 19 korea      34
## 20 real       33

5.3 Visualization — Top 25 Terms

# Horizontal bar chart of the 25 most frequent words.
# with_ties = FALSE caps the chart at exactly 25 bars: slice_max() keeps every
# row tied with the last one by default, which could show more than the
# "Top 25" promised in the title.
word_freq %>%
  slice_max(n, n = 25, with_ties = FALSE) %>%
  # Order the factor levels by count so the bars are drawn sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#74c476", high = "#006d2c") +
  labs(
    title    = "Top 25 Most Frequent Words in YouTube Comments",
    subtitle = paste("Source video:", target_video_title),
    x        = "Frequency",
    y        = NULL,
    caption  = "Stop words and generic terms removed. Data collected via tuber/YouTube Data API v3."
  ) +
  theme_minimal(base_size = 13)

5.4 Word Cloud

# Word-cloud input: words seen at least twice, limited to the 150 highest
# counts (rows tied at the cut-off are kept, so there can be more than 150).
wc_data <- slice_max(filter(word_freq, n >= 2), order_by = n, n = 150)

# Render the interactive word cloud from the (word, n) table.
wordcloud2(wc_data, size = 0.5, color = "random-dark",
           backgroundColor = "white")

6. Summary Statistics

# Console summary of the cleaned dataset and the word-frequency table.
cat("=== Dataset Summary ===\n")
## === Dataset Summary ===
# This count comes from comments_clean, so it is labelled as the post-cleaning
# figure; "Total comments collected" is the label used for the raw count.
cat("Comments after cleaning:  ", nrow(comments_clean), "\n")
## Comments after cleaning:   1858
cat("Unique commenters:        ", n_distinct(comments_clean$author), "\n")
## Unique commenters:         1568
# na.rm = TRUE skips timestamps that failed to parse.
cat("Date range:               ",
    format(min(comments_clean$published_at, na.rm = TRUE), "%Y-%m-%d"), "to",
    format(max(comments_clean$published_at, na.rm = TRUE), "%Y-%m-%d"), "\n")
## Date range:                2026-03-30 to 2026-06-30
cat("Avg. likes per comment:   ",
    round(mean(comments_clean$like_count, na.rm = TRUE), 2), "\n")
## Avg. likes per comment:    32.19
cat("Total unique tokens:      ", nrow(word_freq), "\n")
## Total unique tokens:       3402
cat("Top 3 words:              ",
    paste(head(word_freq$word, 3), collapse = ", "), "\n")
## Top 3 words:               action, mayor, love

7. Interpretation

This video features Action Bronson, a rapper and cultural figure known for blending hip-hop with food culture while representing New York City, as he reviews a Yemeni restaurant in New York City with the city's new mayor, Zohran Mamdani. The video explores themes of New York diversity, personal experience, and culinary storytelling rather than explicit political commentary. With 931 thousand views, 38 thousand likes, and 2,432 total comments, the engagement metrics suggest moderate but meaningful audience interest.

The word frequency analysis shows that top terms like “action,” “bronson,” “mayor,” “mamdani,” “zohran,” and “nyc” align directly with the video’s content — the names of its key figures and its geographic identity (“action” almost certainly comes from the name “Action Bronson” rather than the common noun). Beyond these names, the prevalence of words like “food,” “love,” “people,” and “episode” suggests that commenters are engaging with both the personality-driven and culinary aspects of the content. The word cloud visualization emphasizes these dominant themes, indicating that viewers are discussing the show’s format and its cultural and geographic elements rather than critiquing or debating.

One notable pattern is the relative absence of negative sentiment words among the most frequent terms, which suggests that comments skew positive or neutral — viewers seem to be appreciating the cultural narrative and food exploration rather than expressing criticism. This is consistent with the high like-to-comment ratio (38K likes to 2.4K comments), which often indicates satisfied, engaged viewers, although a dedicated sentiment analysis would be needed to confirm it.