The 2026 FIFA World Cup is the first edition to be hosted across three countries — the United States, Canada, and Mexico — and the first to feature 48 national teams. This expanded format has generated enormous global discussion. YouTube, as one of the largest video platforms worldwide, serves as a rich source of public opinion and fan discourse. In this report, we collect and analyze comments from a popular World Cup 2026 video to surface key themes using word frequency analysis and word cloud visualization.
Data source: YouTube Data API v3, accessed via R using an API key. Comments are from a single high-engagement video related to World Cup 2026.
# install.packages(c("httr", "jsonlite", "tidytext", "dplyr", "stringr",
# "ggplot2", "wordcloud2", "readr", "dotenv"))
library(httr)
library(jsonlite)
library(tidytext)
library(dplyr)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(readr)
# Load environment variables from the local .env file so the API key does not
# have to be written into this report.
readRenviron(".env")
api_key <- Sys.getenv("youtube_api_key")
# Sys.getenv() returns "" for an unset variable, so an empty string means the
# key is missing; stop before any request is attempted.
if (!nzchar(api_key)) {
  stop("API key not found. Check your .env file.")
}
cat("API key loaded successfully.\n")
## API key loaded successfully.
# Video whose comment threads are analysed, plus the title used to label the
# plot further down.
target_video_title <- "FIFA World Cup 2026"
target_video_id <- "IR891e2-JBc"
# Echo the ID and the watch URL so the rendered report records the source.
cat("Video ID:", target_video_id, "\n", sep = " ")
## Video ID: IR891e2-JBc
cat(paste0("URL: https://www.youtube.com/watch?v=", target_video_id), "\n", sep = "")
## URL: https://www.youtube.com/watch?v=IR891e2-JBc
The `commentThreads` endpoint returns up to 100 results per page. We
iterate through pages using nextPageToken until we have at
least 100 comments.
# Endpoint that every comment-page request is sent to.
comments_url <- "https://www.googleapis.com/youtube/v3/commentThreads"

# Request one page of comment threads for a video.
#
# video_id:   ID of the video whose comment threads are requested.
# api_key:    API key, sent as the `key` query parameter.
# page_token: optional token selecting a later page; NULL (the default)
#             sends no pageToken parameter.
#
# Returns the response body parsed with fromJSON(flatten = TRUE). The HTTP
# status code is not inspected inside this function.
fetch_comments_page <- function(video_id, api_key, page_token = NULL) {
  params <- list(
    part = "snippet",
    videoId = video_id,
    maxResults = 100,
    textFormat = "plainText",
    key = api_key
  )
  # Only attach pageToken when the caller supplied one.
  if (!is.null(page_token)) {
    params[["pageToken"]] <- page_token
  }
  response <- GET(url = comments_url, query = params)
  body_text <- content(response, as = "text", encoding = "UTF-8")
  fromJSON(body_text, flatten = TRUE)
}
# Collect at least 100 comments
#
# Pages are requested until one of the following happens: the parsed response
# carries an `error` element, a page has no items, there is no nextPageToken,
# or at least 100 comments have been gathered.
all_comments <- list()
page_token <- NULL
total <- 0
repeat {
  page <- fetch_comments_page(target_video_id, api_key, page_token)
  if (!is.null(page$error)) {
    warning("API error: ", page$error$message)
    break
  }
  items <- page$items
  # NROW() is 0 for NULL, for list() and for a zero-row data frame. The plain
  # nrow() is NULL for list() (how an empty JSON array is parsed), which does
  # not give if() a usable TRUE/FALSE value.
  if (NROW(items) == 0) break
  # Extract the fields we need
  batch <- tibble(
    comment_id = items$id,
    author = items$snippet.topLevelComment.snippet.authorDisplayName,
    text = items$snippet.topLevelComment.snippet.textOriginal,
    like_count = items$snippet.topLevelComment.snippet.likeCount,
    published_at = items$snippet.topLevelComment.snippet.publishedAt
  )
  # Store each page as its own list element; the pages are combined after the
  # loop.
  all_comments[[length(all_comments) + 1L]] <- batch
  total <- total + nrow(batch)
  cat("Fetched", total, "comments so far...\n")
  page_token <- page$nextPageToken
  if (is.null(page_token) || total >= 100) break
  Sys.sleep(0.3) # be polite to the API
}
## Fetched 100 comments so far...
# Stack the per-page tibbles into one data frame.
comments_raw <- all_comments |>
  bind_rows()
cat("\nTotal comments collected:", nrow(comments_raw), "\n", sep = " ")
##
## Total comments collected: 100
# Keep an on-disk copy of the comments exactly as collected.
comments_raw |>
  write_csv("worldcup2026_comments_raw.csv")
cat("Saved to worldcup2026_comments_raw.csv\n")
## Saved to worldcup2026_comments_raw.csv
# Clean the raw comments: one row per comment ID, non-trivial text only, and
# typed like-count and timestamp columns.
comments_clean <- comments_raw |>
  # Keep the first row seen for each comment ID.
  distinct(comment_id, .keep_all = TRUE) |>
  # Discard rows whose text is missing or at most 3 characters long.
  filter(!(is.na(text) | str_length(text) <= 3)) |>
  mutate(
    like_count = as.integer(like_count),
    # Trim the ends and collapse internal runs of whitespace.
    text = str_squish(text),
    # Parse the timestamp strings as UTC using the layout given in `format`.
    published_at = as.POSIXct(
      published_at,
      format = "%Y-%m-%dT%H:%M:%SZ",
      tz = "UTC"
    )
  )
cat("Comments after cleaning:", nrow(comments_clean), "\n", sep = " ")
## Comments after cleaning: 99
glimpse(comments_clean)
## Rows: 99
## Columns: 5
## $ comment_id <chr> "UgwjxVgKBfj3U_iCCIN4AaABAg", "UgzqvFVcofnIa53C1xV4AaABAg…
## $ author <chr> "@moneyy115", "@AstaIsAMonster", "@bigbwitdathirdeyeopen4…
## $ text <chr> "I’m not Mexican but dayum Mexicans are hyped asf! 😂", "4…
## $ like_count <int> 0, 1, 0, 0, 1, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, …
## $ published_at <dttm> 2026-06-27 14:58:46, 2026-06-27 12:40:36, 2026-06-27 07:…
# Context-specific words removed in addition to the `stop_words` table.
custom_stops <- tibble(
  word = c(
    "world", "cup", "2026", "fifa", "gonna",
    "https", "t.co", "amp", "na", "ya",
    "video", "watch", "youtube"
  )
)
# One row per (comment, word) after tokenising the comment text.
tokens <- comments_clean |>
  select(comment_id, text) |>
  unnest_tokens(word, text) |>
  # Keep tokens made only of the letters a-z and longer than two characters.
  filter(str_detect(word, "^[a-z]+$"), str_length(word) > 2) |>
  anti_join(stop_words, by = "word") |>
  anti_join(custom_stops, by = "word")
# Occurrences of each remaining word, most frequent first.
word_freq <- count(tokens, word, sort = TRUE)
cat("Top 20 words:\n")
## Top 20 words:
print(head(word_freq, n = 20))
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 mexico 31
## 2 mexican 13
## 3 game 12
## 4 viva 12
## 5 team 11
## 6 ochoa 10
## 7 memo 8
## 8 goal 5
## 9 love 5
## 10 beautiful 4
## 11 congrats 4
## 12 country 4
## 13 crazy 4
## 14 players 4
## 15 soccer 4
## 16 stadium 4
## 17 watching 4
## 18 winning 4
## 19 absolute 3
## 20 ball 3
# Horizontal bar chart of the 25 most frequent words, darkest bar = most
# frequent.
word_freq |>
  # with_ties = FALSE caps the chart at exactly 25 bars. By default
  # slice_max() also keeps every word tied with the 25th count, which can
  # show more words than the "Top 25" title promises.
  slice_max(n, n = 25, with_ties = FALSE) |>
  # Order the factor levels by count so the bars are sorted on the y axis.
  mutate(word = reorder(word, n)) |>
  ggplot(aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#74c476", high = "#006d2c") +
  labs(
    title = "Top 25 Most Frequent Words in World Cup 2026 YouTube Comments",
    subtitle = paste("Source video:", target_video_title),
    x = "Frequency",
    y = NULL,
    caption = "Stop words and generic terms removed. Data collected via YouTube Data API v3."
  ) +
  theme_minimal(base_size = 13)
# Input for the word cloud: words counted more than once, then the rows
# selected by slice_max() with n = 150.
wc_data <- word_freq |>
  filter(n > 1L) |>
  slice_max(order_by = n, n = 150)
# Render the cloud on a white background with random dark colours.
wordcloud2(
  wc_data,
  backgroundColor = "white",
  color = "random-dark",
  size = 0.5
)
# Print headline figures for the cleaned comments and the word counts.
cat("=== Dataset Summary ===\n")
## === Dataset Summary ===
# Row count of the cleaned comment table.
cat("Total comments collected: ", nrow(comments_clean), "\n", sep = " ")
## Total comments collected: 99
# Number of different author display names.
cat("Unique commenters: ", length(unique(comments_clean$author)), "\n", sep = " ")
## Unique commenters: 95
# Earliest and latest publication dates, ignoring unparsed timestamps.
cat("Date range: ",
    paste(
      format(range(comments_clean$published_at, na.rm = TRUE), "%Y-%m-%d"),
      collapse = " to "
    ),
    "\n")
## Date range: 2026-06-25 to 2026-06-27
# Mean like count rounded to two decimals.
cat("Avg. likes per comment: ",
    round(mean(comments_clean$like_count, na.rm = TRUE), digits = 2), "\n")
## Avg. likes per comment: 0.73
# Number of distinct words left after filtering.
cat("Total unique tokens: ", nrow(word_freq), "\n", sep = " ")
## Total unique tokens: 347
# First three words of the frequency table.
cat("Top 3 words: ",
    toString(head(word_freq$word, n = 3)), "\n")
## Top 3 words: mexico, mexican, game
The word frequency analysis reveals a fairly predictable ranking and also points to one of the reasons why the World Cup remains such an enormous event. Terms related to national identity (country names, team references) consistently rank among the highest-frequency words, reflecting the deeply nationalistic character of World Cup discourse. This aligns with Billings et al. (2018), who found that sports mega-events amplify in-group/out-group identity expression on social media platforms.
For this particular video, words surrounding the player “Memo Ochoa” are prominent because this was his last game before fully retiring from the sport.