1. Introduction

The 2026 FIFA World Cup is the first edition to be hosted across three countries — the United States, Canada, and Mexico — and the first to feature 48 national teams. This expanded format has generated enormous global discussion. YouTube, as one of the largest video platforms worldwide, serves as a rich source of public opinion and fan discourse. In this report, we collect and analyze comments from a popular World Cup 2026 video to surface key themes using word frequency analysis and word cloud visualization.

Data source: YouTube Data API v3, accessed via R using an API key. Comments are from a single high-engagement video related to World Cup 2026.


2. Setup

# install.packages(c("httr", "jsonlite", "tidytext", "dplyr", "stringr",
#                    "ggplot2", "wordcloud2", "readr"))

library(httr)
library(jsonlite)
library(tidytext)
library(dplyr)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(readr)

2.1 Load API Key from .env

# Read KEY=value pairs from the project-local .env file into this session's
# environment, then look up the YouTube Data API key.
readRenviron(".env")
api_key <- Sys.getenv("youtube_api_key")

# Sys.getenv() yields "" for a variable that is not set, so an empty string
# means the key was not loaded; abort before any request is attempted.
if (!nzchar(api_key)) {
  stop("API key not found. Check your .env file.")
}
cat("API key loaded successfully.\n")
## API key loaded successfully.

3. Data Collection

3.1 Target Video

# Video under analysis: the display title (used later as the plot subtitle)
# and the YouTube video ID passed to the API.
target_video_title <- "FIFA World Cup 2026"
target_video_id    <- "IR891e2-JBc"

cat("Video ID:", target_video_id, "\n")
## Video ID: IR891e2-JBc
cat(paste0("URL: https://www.youtube.com/watch?v=", target_video_id, "\n"))
## URL: https://www.youtube.com/watch?v=IR891e2-JBc

3.2 Scrape Comments (100+)

The commentThreads endpoint returns up to 100 results per page. We iterate through pages using nextPageToken until we have collected at least 100 comments or no further pages remain.

comments_url <- "https://www.googleapis.com/youtube/v3/commentThreads"

#' Fetch one page of comment threads for a video
#'
#' Sends a single GET request to the commentThreads endpoint and returns the
#' parsed JSON body.
#'
#' @param video_id ID of the video whose comment threads are requested.
#' @param api_key API key sent as the `key` query parameter.
#' @param page_token Optional page token (the `nextPageToken` of a previous
#'   page). `NULL` omits the parameter, i.e. requests the first page.
#' @param max_results Value sent as `maxResults` (results per page).
#' @param base_url Endpoint URL; defaults to the file-level `comments_url`.
#' @return The response body parsed by `fromJSON(flatten = TRUE)`. The body is
#'   returned as-is, so a JSON error payload reaches the caller unchanged.
fetch_comments_page <- function(video_id, api_key, page_token = NULL,
                                max_results = 100, base_url = comments_url) {
  query_params <- list(
    part       = "snippet",
    videoId    = video_id,
    maxResults = max_results,
    textFormat = "plainText",
    key        = api_key
  )
  if (!is.null(page_token)) {
    query_params$pageToken <- page_token
  }

  resp <- GET(url = base_url, query = query_params)

  # Only JSON can be parsed below. Anything else (for example an HTML error
  # page from a proxy) would otherwise surface as an obscure parser error,
  # so fail here with the status and content type instead.
  resp_type <- http_type(resp)
  if (!identical(resp_type, "application/json")) {
    stop(
      "Unexpected non-JSON response (HTTP ", status_code(resp),
      ", content type '", resp_type, "').",
      call. = FALSE
    )
  }

  content(resp, as = "text", encoding = "UTF-8") |>
    fromJSON(flatten = TRUE)
}

# Collect at least `min_comments` top-level comments, one API page at a time.
# Each page is reduced to a small tibble; the pages are combined once at the
# end with bind_rows().
min_comments <- 100
all_comments <- list()
page_token   <- NULL
total        <- 0

repeat {
  page <- fetch_comments_page(target_video_id, api_key, page_token)

  # An `error` element in the parsed body signals an API-level failure:
  # warn, keep whatever was collected so far, and stop paging.
  if (!is.null(page$error)) {
    warning("API error: ", page$error$message)
    break
  }

  items <- page$items
  # NROW() is 0 for NULL and for an empty list (how an empty JSON array is
  # parsed), whereas nrow() returns NULL there and breaks the condition.
  if (NROW(items) == 0) break

  # Extract the fields we need
  batch <- tibble(
    comment_id   = items$id,
    author       = items$snippet.topLevelComment.snippet.authorDisplayName,
    text         = items$snippet.topLevelComment.snippet.textOriginal,
    like_count   = items$snippet.topLevelComment.snippet.likeCount,
    published_at = items$snippet.topLevelComment.snippet.publishedAt
  )

  all_comments <- append(all_comments, list(batch))
  total        <- total + nrow(batch)

  cat("Fetched", total, "comments so far...\n")

  # Stop when there are no more pages or the target has been reached.
  page_token <- page$nextPageToken
  if (is.null(page_token) || total >= min_comments) break
  Sys.sleep(0.3)   # be polite to the API
}
## Fetched 100 comments so far...

# With nothing collected, bind_rows() would yield a tibble without columns and
# every later step would fail on a missing column; stop with a clear message.
if (total == 0) {
  stop(
    "No comments were collected; check the video ID, API key and quota.",
    call. = FALSE
  )
}
comments_raw <- bind_rows(all_comments)
cat("\nTotal comments collected:", nrow(comments_raw), "\n")
## 
## Total comments collected: 100

3.3 Save Raw Data

# Persist the untouched API extract so the analysis can be re-run without
# calling the API again.
comments_raw |>
  write_csv("worldcup2026_comments_raw.csv")
cat("Saved to worldcup2026_comments_raw.csv\n")
## Saved to worldcup2026_comments_raw.csv

4. Data Cleaning

# Clean the raw comments:
#   1. keep one row per comment_id,
#   2. normalise whitespace and convert column types,
#   3. drop missing or trivially short comments.
# The length filter runs AFTER str_squish() so that whitespace-only or
# whitespace-padded comments are measured by their real content and cannot
# slip through as empty text.
comments_clean <- comments_raw |>
  distinct(comment_id, .keep_all = TRUE) |>          # remove duplicates
  mutate(
    text         = str_squish(text),                   # collapse whitespace
    published_at = as.POSIXct(published_at,
                               format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC"),
    like_count   = as.integer(like_count)
  ) |>
  filter(!is.na(text), str_length(text) > 3)          # drop empty/trivial

cat("Comments after cleaning:", nrow(comments_clean), "\n")
## Comments after cleaning: 99
glimpse(comments_clean)
## Rows: 99
## Columns: 5
## $ comment_id   <chr> "UgwjxVgKBfj3U_iCCIN4AaABAg", "UgzqvFVcofnIa53C1xV4AaABAg…
## $ author       <chr> "@moneyy115", "@AstaIsAMonster", "@bigbwitdathirdeyeopen4…
## $ text         <chr> "I’m not Mexican but dayum Mexicans are hyped asf! 😂", "4…
## $ like_count   <int> 0, 1, 0, 0, 1, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, …
## $ published_at <dttm> 2026-06-27 14:58:46, 2026-06-27 12:40:36, 2026-06-27 07:…

5. Text Analysis

5.1 Tokenize and Remove Stop Words

# Extra stop words specific to this corpus (event name, link fragments,
# platform vocabulary), applied on top of tidytext's `stop_words`.
custom_stops <- tibble(
  word = c(
    "world", "cup", "2026", "fifa", "gonna",
    "https", "t.co", "amp", "na", "ya",
    "video", "watch", "youtube"
  )
)

# One row per (comment, word). A token is kept only if it
#   - consists solely of the ASCII letters a-z (so tokens containing digits,
#     punctuation or accented letters are dropped),
#   - is in neither stop-word list, and
#   - is at least three characters long.
tokens <- comments_clean |>
  select(comment_id, text) |>
  unnest_tokens(word, text) |>
  filter(
    str_detect(word, "^[a-z]+$"),
    !word %in% stop_words$word,
    !word %in% custom_stops$word,
    str_length(word) >= 3
  )

5.2 Word Frequency

# Frequency table: one row per distinct word with its occurrence count `n`,
# ordered from most to least frequent.
word_freq <- tokens |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n))

cat("Top 20 words:\n")
## Top 20 words:
word_freq |>
  head(20) |>
  print()
## # A tibble: 20 × 2
##    word          n
##    <chr>     <int>
##  1 mexico       31
##  2 mexican      13
##  3 game         12
##  4 viva         12
##  5 team         11
##  6 ochoa        10
##  7 memo          8
##  8 goal          5
##  9 love          5
## 10 beautiful     4
## 11 congrats      4
## 12 country       4
## 13 crazy         4
## 14 players       4
## 15 soccer        4
## 16 stadium       4
## 17 watching      4
## 18 winning       4
## 19 absolute      3
## 20 ball          3

5.3 Visualization — Top 25 Terms

# Horizontal bar chart of the 25 most frequent words.
# slice_max() keeps ties by default, which can return far more than 25 rows
# when many words share the cut-off count; with_ties = FALSE caps the chart at
# exactly 25 bars so it matches its title.
word_freq |>
  slice_max(n, n = 25, with_ties = FALSE) |>
  mutate(word = reorder(word, n)) |>          # order bars by frequency
  ggplot(aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#74c476", high = "#006d2c") +
  labs(
    title    = "Top 25 Most Frequent Words in World Cup 2026 YouTube Comments",
    subtitle = paste("Source video:", target_video_title),
    x        = "Frequency",
    y        = NULL,
    caption  = "Stop words and generic terms removed. Data collected via YouTube Data API v3."
  ) +
  theme_minimal(base_size = 13)

5.4 Word Cloud

# Word-cloud input: words that occur at least twice, limited to the 150
# highest counts (ties at the cut-off are kept).
repeated_words <- filter(word_freq, n >= 2)
wc_data <- slice_max(repeated_words, order_by = n, n = 150)

# Render the interactive cloud on a white background with random dark colours.
wordcloud2(
  wc_data,
  size            = 0.5,
  backgroundColor = "white",
  color           = "random-dark"
)

6. Summary Statistics

# Headline figures for the cleaned comment set and the derived vocabulary.
cat("=== Dataset Summary ===\n")
## === Dataset Summary ===
# This figure is the row count of comments_clean (after de-duplication and
# filtering), so it is labelled as such rather than as the number collected.
cat("Comments after cleaning:  ", nrow(comments_clean), "\n")
## Comments after cleaning:   99
cat("Unique commenters:        ", n_distinct(comments_clean$author), "\n")
## Unique commenters:         95
# Earliest and latest comment timestamps, shown as calendar dates.
cat("Date range:               ",
    format(min(comments_clean$published_at, na.rm = TRUE), "%Y-%m-%d"), "to",
    format(max(comments_clean$published_at, na.rm = TRUE), "%Y-%m-%d"), "\n")
## Date range:                2026-06-25 to 2026-06-27
cat("Avg. likes per comment:   ",
    round(mean(comments_clean$like_count, na.rm = TRUE), 2), "\n")
## Avg. likes per comment:    0.73
# word_freq has one row per distinct word, so its row count is the vocabulary
# size after stop-word removal.
cat("Total unique tokens:      ", nrow(word_freq), "\n")
## Total unique tokens:       347
cat("Top 3 words:              ",
    paste(head(word_freq$word, 3), collapse = ", "), "\n")
## Top 3 words:               mexico, mexican, game

7. Interpretation

The word frequency analysis reveals a fairly predictable ranking, and it also points to one of the reasons why the World Cup remains such an enormous event. Terms related to national identity (country names, team references) consistently rank among the highest-frequency words, reflecting the deeply nationalistic character of World Cup discourse. This aligns with Billings et al. (2018), who found that sports mega-events amplify in-group/out-group identity expression on social media platforms.

For this particular video, words surrounding the player “Memo Ochoa” are prominent because this was his last game before fully retiring from the sport.


8. References

  • Billings, A. C., Butterworth, M. L., & Turman, P. D. (2018). Communication and sport: Surveying the field (3rd ed.). SAGE Publications.
  • Google Developers. (2024). YouTube Data API v3 Reference. https://developers.google.com/youtube/v3
  • Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. O’Reilly Media. https://www.tidytextmining.com