Važno: Ova analiza je sprovedena u cilju istraživanja propagandnih obrazaca i javnog diskursa, a ne targetiranja pojedinaca.
Principi: - Lični podaci (user name-ovi, telefoni, adrese) se ne objavljuju - Rezultati su agregirani; grupe su anonimizovane (npr. “Group A”, “Group B”) - Puni tekstovi poruka sa govorom mržnje se ne prikazuju u izlazu - Prikazani su samo anonimizovani isečci (20-30 reči) ili ključni termini - Javni kanali mogu biti identifikovani samo uz eksplicitnu dozvolu korisnika
# Funkcija za brzo učitavanje CSV-a
load_telegram_data <- function(path,
date_filter = NULL, # "YYYY-MM-DD" format
keyword_filter = NULL, # vektor ključnih reči
max_rows = NULL) { # limit broja redova
cat("Učitavanje podataka iz:", path, "\n")
# Provera postojanja fajla
if (!file.exists(path)) {
stop("CSV fajl nije pronađen na putanji: ", path)
}
# Učitavanje sa data.table::fread (brzo)
cat("Učitavanje CSV fajla...\n")
start_time <- Sys.time()
# Učitaj samo kolone koje su potrebne (optimizacija)
# Prvo pročitaj samo prvi red da vidimo kolone
sample <- data.table::fread(path, nrows = 1000)
col_names <- names(sample)
cat("Pronađene kolone:", paste(col_names, collapse = ", "), "\n")
# Mapiranje kolona (robustno)
col_mapping <- list(
group_id = c("group_id", "group_name", "channel", "group", "source"),
post_id = c("post_id", "id", "message_id", "msg_id"),
date_time = c("date_time", "timestamp", "date", "time", "created_at", "datetime"),
text = c("text", "message", "content", "body", "post_text"),
forwarded_from = c("forwarded_from", "forward_from", "forward_source"),
reply_to = c("reply_to", "reply_to_id", "reply"),
views = c("views", "view_count", "view"),
links = c("links", "urls", "link"),
media_type = c("media_type", "media", "attachment_type"),
sender = c("sender", "author", "user", "username", "from")
)
# Pronađi stvarne nazive kolona
actual_cols <- list()
for (var_name in names(col_mapping)) {
found <- intersect(col_mapping[[var_name]], col_names)
if (length(found) > 0) {
actual_cols[[var_name]] <- found[1]
cat("Mapirano:", var_name, "->", found[1], "\n")
} else {
actual_cols[[var_name]] <- NA
cat("Nije pronađeno:", var_name, "\n")
}
}
# Učitaj puni dataset
if (is.null(max_rows)) {
dt <- data.table::fread(path, encoding = "UTF-8", showProgress = TRUE)
} else {
dt <- data.table::fread(path, nrows = max_rows, encoding = "UTF-8", showProgress = TRUE)
}
# Preimenuj kolone na standardne nazive
for (var_name in names(actual_cols)) {
if (!is.na(actual_cols[[var_name]])) {
data.table::setnames(dt, actual_cols[[var_name]], var_name)
}
}
# Kreiraj standardne kolone ako ne postoje
if (!"group_id" %in% names(dt)) {
if ("group_name" %in% names(dt)) {
dt[, group_id := group_name]
} else {
dt[, group_id := paste0("Group_", seq_len(.N))]
}
}
if (!"post_id" %in% names(dt)) {
dt[, post_id := seq_len(.N)]
}
if (!"text" %in% names(dt)) {
stop("Kritična kolona 'text' nije pronađena!")
}
# Parsiranje datuma
if ("date_time" %in% names(dt)) {
# Pokušaj različite formate
dt[, date_time_parsed := as.POSIXct(NA)]
# Pokušaj različite formate
formats <- c(
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%d",
"%d.%m.%Y %H:%M:%S",
"%d/%m/%Y %H:%M:%S"
)
for (fmt in formats) {
parsed <- as.POSIXct(dt$date_time, format = fmt, tz = "Europe/Belgrade")
dt[is.na(date_time_parsed) & !is.na(parsed), date_time_parsed := parsed]
}
dt[, date := as.Date(date_time_parsed)]
} else {
dt[, date := Sys.Date()] # Fallback
dt[, date_time_parsed := as.POSIXct(Sys.time())]
}
# Filter po datumu
if (!is.null(date_filter)) {
filter_date <- as.Date(date_filter)
dt <- dt[date >= filter_date]
cat("Filtrirano po datumu >= ", date_filter, ": ", nrow(dt), " redova\n")
}
# Filter po ključnim rečima
if (!is.null(keyword_filter) && length(keyword_filter) > 0) {
pattern <- paste(keyword_filter, collapse = "|")
dt <- dt[grepl(pattern, text, ignore.case = TRUE)]
cat("Filtrirano po ključnim rečima: ", nrow(dt), " redova\n")
}
# Dodaj redni broj
dt[, row_id := seq_len(.N)]
end_time <- Sys.time()
cat("Učitano", nrow(dt), "redova za",
round(as.numeric(end_time - start_time, units = "secs"), 2), "sekundi\n")
return(dt)
}
# Učitaj podatke
# Opcija 1: Učitaj sve (može biti sporo za 302 MB)
# dt <- load_telegram_data(CSV_PATH)
# Opcija 2: Učitaj samo poslednjih N dana (brže za testiranje)
# dt <- load_telegram_data(CSV_PATH, date_filter = "2024-01-01")
# Opcija 3: Učitaj samo postove sa ključnim rečima
# dt <- load_telegram_data(CSV_PATH, keyword_filter = c("Kragujevac", "migrant", "protest"))
# Opcija 4: Učitaj ograničen broj redova za testiranje
# dt <- load_telegram_data(CSV_PATH, max_rows = 100000)
# TODO: Odkomentariši jednu od opcija iznad ili prilagodi parametre
# Za produkciju, koristi Opciju 1 ili 2
# Privremeno: učitaj sample za testiranje (promeni ovo!)
dt <- load_telegram_data(CSV_PATH, max_rows = 50000)## Učitavanje podataka iz: D:/MIGRANTI/telegram-migranti.csv
## Učitavanje CSV fajla...
## Pronađene kolone: id, date, message, editDate, views, forwards, messageLink, pinned, fwdFrom, viaBotId, postAuthor, replies, groupedId, reactions._👍, reactions._❤, reactions._🔥, reactions._🥰, post, entities, restrictionReason, channel, className, action, reactions._🙏, reactions._👏, reactions._💯, reactions._😁, reactions._🤡, reactions._🤣, reactions._🖕, reactions._🎉, reactions._🏆, reactions._👌, reactions, reactions._😢, groupedId.value, reactions._🙈, reactions._💊, reactions._🍾, reactions._🥱, reactions._👎, reactions._😈, reactions._😨, reactions._👀, reactions._🤬, reactions._🤔, reactions._🫡, reactions._🤩, reactions._😭, reactions._🤨, reactions._🍌, reactions._🤯, reactions._🤓, reactions._🤷♂, reactions._🤪, reactions._🕊, reactions._😡, reactions._🌭, reactions._❤🔥, reactions._⚡, reactions._🥴, fwdFrom.fwd_channel, fwdFrom.fwd_message, reactions._😱, reactions._🗿, reactions._🦄, reactions._🐳, reactions._🤝, reactions._🌚, reactions._😍, reactions._🙉, reactions._✍, reactions._🆒, reactions._😇, reactions._👻, reactions._😎, reactions._💅, reactions._🙊, reactions._👾, reactions._🤗, reactions._🤷, reactions._😐, reactions._🎃, reactions._🤷♀, reactions._😴, reactions._☃, reactions._🎄, reactions._😘, reactions._👨💻, reactions._🎅, reactions._💔, reactions._💋, reactions._🍓, reactions._💘, reactions._💩, reactions._🤮, viaBotId.value
## Mapirano: group_id -> channel
## Mapirano: post_id -> id
## Mapirano: date_time -> date
## Mapirano: text -> message
## Nije pronađeno: forwarded_from
## Nije pronađeno: reply_to
## Mapirano: views -> views
## Nije pronađeno: links
## Nije pronađeno: media_type
## Nije pronađeno: sender
## Učitano 50000 redova za 0.79 sekundi
##
## === PREGLED DATASETA ===
## Broj redova: 50000
## Broj kolona: 100
## Kolone: post_id, date_time, text, editDate, views, forwards, messageLink, pinned, fwdFrom, viaBotId, postAuthor, replies, groupedId, reactions._👍, reactions._❤, reactions._🔥, reactions._🥰, post, entities, restrictionReason, group_id, className, action, reactions._🙏, reactions._👏, reactions._💯, reactions._😁, reactions._🤡, reactions._🤣, reactions._🖕, reactions._🎉, reactions._🏆, reactions._👌, reactions, reactions._😢, groupedId.value, reactions._🙈, reactions._💊, reactions._🍾, reactions._🥱, reactions._👎, reactions._😈, reactions._😨, reactions._👀, reactions._🤬, reactions._🤔, reactions._🫡, reactions._🤩, reactions._😭, reactions._🤨, reactions._🍌, reactions._🤯, reactions._🤓, reactions._🤷♂, reactions._🤪, reactions._🕊, reactions._😡, reactions._🌭, reactions._❤🔥, reactions._⚡, reactions._🥴, fwdFrom.fwd_channel, fwdFrom.fwd_message, reactions._😱, reactions._🗿, reactions._🦄, reactions._🐳, reactions._🤝, reactions._🌚, reactions._😍, reactions._🙉, reactions._✍, reactions._🆒, reactions._😇, reactions._👻, reactions._😎, reactions._💅, reactions._🙊, reactions._👾, reactions._🤗, reactions._🤷, reactions._😐, reactions._🎃, reactions._🤷♀, reactions._😴, reactions._☃, reactions._🎄, reactions._😘, reactions._👨💻, reactions._🎅, reactions._💔, reactions._💋, reactions._🍓, reactions._💘, reactions._💩, reactions._🤮, viaBotId.value, date_time_parsed, date, row_id
cat("Vremenski raspon:",
if ("date" %in% names(dt)) paste(range(dt$date, na.rm = TRUE), collapse = " - ") else "N/A",
"\n")## Vremenski raspon: 2021-09-24 - 2025-12-12
# Funkcija za čišćenje teksta
clean_text <- function(text) {
if (is.null(text) || length(text) == 0) return(character(0))
text <- as.character(text)
# Ukloni NA
text[is.na(text)] <- ""
# Normalizacija latinice/ćirilice (osnovna)
# TODO: Dodaj kompletnu transliteraciju ako je potrebno
# Ukloni URL-ove
text <- str_replace_all(text, "https?://[\\S]+", " ")
text <- str_replace_all(text, "www\\.[\\S]+", " ")
# Ukloni email adrese
text <- str_replace_all(text, "[\\S]+@[\\S]+", " ")
# Ukloni telefone (osnovni pattern)
text <- str_replace_all(text, "\\+?[0-9]{8,}", " ")
# Ukloni višestruke razmake
text <- str_replace_all(text, "\\s+", " ")
# Trim
text <- str_trim(text)
return(text)
}
# Čišćenje teksta
dt[, text_clean := clean_text(text)]
# Ukloni prazne postove
dt <- dt[text_clean != "" & nchar(text_clean) > 5]
# Dodaj dodatne kolone
dt[, text_length := nchar(text_clean)]
dt[, word_count := str_count(text_clean, "\\S+")]
# Anonimizacija grupa (osim ako su javni kanali)
if ("group_id" %in% names(dt)) {
# Kreiraj anonimizovane nazive
unique_groups <- unique(dt$group_id)
group_mapping <- data.table(
group_id = unique_groups,
group_anon = paste0("Group_", LETTERS[seq_along(unique_groups)])
)
dt <- data.table::merge.data.table(dt, group_mapping, by = "group_id", all.x = TRUE)
} else {
dt[, group_anon := "Group_A"]
}
cat("Očišćeno:", nrow(dt), "postova\n")## Očišćeno: 34609 postova
## Broj grupa: 3
## === OPIS DATASETA ===
## Ukupan broj postova: 34609
## Broj jedinstvenih grupa: 3
if ("date" %in% names(dt)) {
cat("Vremenski raspon:",
min(dt$date, na.rm = TRUE), "do",
max(dt$date, na.rm = TRUE), "\n")
}## Vremenski raspon: 18894 do 20434
# Broj postova po grupama
posts_by_group <- dt[, .N, by = group_anon][order(-N)]
print(posts_by_group)## group_anon N
## <char> <int>
## 1: Group_A 24233
## 2: Group_C 7576
## 3: Group_B 2800
# Trend postova po danu
if ("date" %in% names(dt)) {
posts_by_date <- dt[!is.na(date), .N, by = date][order(date)]
p1 <- ggplot(posts_by_date, aes(x = date, y = N)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue", alpha = 0.6) +
labs(
title = "Broj postova po danu",
x = "Datum",
y = "Broj postova"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p1)
# Trend po nedelji
dt[!is.na(date), week := lubridate::floor_date(date, "week")]
posts_by_week <- dt[!is.na(week), .N, by = week][order(week)]
p2 <- ggplot(posts_by_week, aes(x = week, y = N)) +
geom_line(color = "darkgreen", linewidth = 1) +
geom_point(color = "darkgreen", alpha = 0.6) +
labs(
title = "Broj postova po nedelji",
x = "Nedelja",
y = "Broj postova"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p2)
}# Lista ključnih reči za Kragujevac
kragujevac_keywords <- c(
"Kragujevac", "Kragujevcu", "Kragujevca", "Kragujevcem",
"KG", "kg", "K.G.",
"Šumadija", "Šumadije", "Šumadiji", "Šumadijom",
"Stanovo", "Stanova",
"Aerodrom", "Aerodroma",
"Centar", "Centru", "Centra",
"Kragujevčani", "Kragujevčanima"
)
# TODO: Dodaj dodatne lokacije/pojmove ako je potrebno
# Detekcija pominjanja Kragujevca
detect_kragujevac <- function(text, keywords = kragujevac_keywords) {
pattern <- paste(keywords, collapse = "|")
grepl(pattern, text, ignore.case = TRUE)
}
dt[, mentions_kg := grepl(paste(kragujevac_keywords, collapse = "|"),
text_clean, ignore.case = TRUE)]
cat("=== KRAGUJEVAC MENTIONS ===\n")## === KRAGUJEVAC MENTIONS ===
## Broj postova koji pominju Kragujevac: 37
## Procenat: 0.11 %
# Trend pominjanja kroz vreme
if ("date" %in% names(dt)) {
kg_by_date <- dt[!is.na(date) & mentions_kg == TRUE, .N, by = date][order(date)]
if (nrow(kg_by_date) > 0) {
p3 <- ggplot(kg_by_date, aes(x = date, y = N)) +
geom_line(color = "red", linewidth = 1) +
geom_point(color = "red", alpha = 0.6) +
labs(
title = "Pominjanje Kragujevca kroz vreme",
x = "Datum",
y = "Broj postova"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p3)
}
}# Pominjanje po grupama
kg_by_group <- dt[mentions_kg == TRUE, .N, by = group_anon][order(-N)]
if (nrow(kg_by_group) > 0) {
cat("\nPominjanje Kragujevca po grupama:\n")
print(kg_by_group)
p4 <- ggplot(kg_by_group, aes(x = reorder(group_anon, N), y = N)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(
title = "Pominjanje Kragujevca po grupama",
x = "Grupa",
y = "Broj postova"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p4)
}##
## Pominjanje Kragujevca po grupama:
## group_anon N
## <char> <int>
## 1: Group_A 19
## 2: Group_C 11
## 3: Group_B 7
# Rečnik CTA fraza
cta_phrases <- c(
"okupljanje", "okupljanja", "okupljanju",
"protest", "protesta", "protestu", "proteste",
"večeras u", "večeras u", "sutra u", "sutra u",
"ponesite", "ponesite", "ponesite",
"blokada", "blokade", "blokadi",
"skup", "skupa", "skupu", "skupovi",
"dođite", "dođi", "dođimo", "dođite",
"share", "delite", "podelite", "podeli",
"hitno", "hitno", "urgentno",
"mobilizacija", "mobilizacije", "mobilizaciji",
"u \\d+ sati", "u \\d+:\\d+", # vreme
"ispred", "ispred",
"kod", "kod",
"na trgu", "na trgu",
"ispred opštine", "ispred opštine",
"sastanak", "sastanka", "sastanku",
"demonstracije", "demonstracija",
"marš", "marša", "maršu"
)
# TODO: Dodaj dodatne CTA fraze ako je potrebno
# Funkcija za izračunavanje CTA score-a
calculate_cta_score <- function(text) {
if (is.na(text) || text == "") return(0)
score <- 0
# Broj CTA fraza
for (phrase in cta_phrases) {
matches <- str_count(text, regex(phrase, ignore_case = TRUE))
score <- score + matches
}
# Detekcija vremena (pattern: "u 18:00", "u 18 sati", "sutra u 20")
time_patterns <- c(
"u \\d{1,2}:\\d{2}",
"u \\d{1,2} sati",
"u \\d{1,2}h"
)
for (pattern in time_patterns) {
if (grepl(pattern, text, ignore.case = TRUE)) {
score <- score + 2
}
}
# Detekcija datuma (pattern: "sutra", "danas", "večeras")
date_words <- c("sutra", "danas", "večeras", "prekosutra")
for (word in date_words) {
if (grepl(paste0("\\b", word, "\\b"), text, ignore.case = TRUE)) {
score <- score + 1
}
}
# Detekcija lokacije (pattern: "ispred", "kod", "na")
location_words <- c("ispred", "kod", "na trgu", "na", "u")
location_count <- sum(sapply(location_words, function(w) {
grepl(paste0("\\b", w, "\\b"), text, ignore.case = TRUE)
}))
if (location_count > 0) {
score <- score + min(location_count, 3)
}
# Uzbunjivanje (ALL CAPS, uzvičnici)
caps_ratio <- str_count(text, "[A-ZА-Я]") / max(nchar(text), 1)
if (caps_ratio > 0.3) {
score <- score + 1
}
exclamation_count <- str_count(text, "!")
if (exclamation_count > 2) {
score <- score + min(exclamation_count - 2, 3)
}
return(score)
}
# Izračunaj CTA score
dt[, cta_score := sapply(text_clean, calculate_cta_score)]
# Flag za "event-like" postove
dt[, flag_cta := cta_score >= 3] # Threshold može se prilagoditi
cat("=== CTA DETEKCIJA ===\n")## === CTA DETEKCIJA ===
## Broj postova sa CTA score >= 3: 189
## Prosečan CTA score: 0.04
# TOP CTA postovi (bez punog teksta - samo anonimizovani isečci)
top_cta <- dt[flag_cta == TRUE][order(-cta_score)][1:min(20, sum(dt$flag_cta))]
top_cta[, text_preview := substr(text_clean, 1, 100)] # Prvih 100 karaktera
cat("\nTOP 10 CTA postova (preview):\n")##
## TOP 10 CTA postova (preview):
## group_anon date cta_score
## <char> <Date> <num>
## 1: Group_A 2022-10-16 9
## 2: Group_A 2025-03-04 6
## 3: Group_B 2024-07-10 6
## 4: Group_A 2025-08-13 5
## 5: Group_A 2025-06-21 5
## 6: Group_A 2025-05-23 5
## 7: Group_A 2025-04-26 5
## 8: Group_C 2025-05-05 4
## 9: Group_C 2025-02-26 4
## 10: Group_C 2025-01-19 4
## 11: Group_A 2024-10-01 4
## 12: Group_A 2024-05-11 4
## 13: Group_A 2023-11-16 4
## 14: Group_A 2023-11-15 4
## 15: Group_A 2023-10-03 4
## 16: Group_A 2023-07-02 4
## 17: Group_B 2025-08-13 4
## 18: Group_B 2025-04-01 4
## 19: Group_B 2024-05-28 4
## 20: Group_B 2023-11-08 4
## group_anon date cta_score
## text_preview
## <char>
## 1: 💥Данас је одржан још један масовни скуп у Молдавији, а увече је ”демократска” председница Маја Санду
## 2: ⚡️⚡️⚡️⚡️⚡️⚡️ NAJVECA GRESKA koju je ""kolektivni zapad"" uradio u 21. veku je sto je naterao ruske n
## 3: SUTRA BRANIMO USTAVNI POREDAK ISPRED USTAVNOG SUDA, U 8 UJUTRO ‼️ Ustavni sud nije zaštitio državno
## 4: 🌍 Pridružite se kanalu Stevan II, koji vodi čovek iz Rusije ! 🇷🇺 Geopolitika, vesti, analize i malo
## 5: 🌍 Pridružite se kanalu Stevan II, koji vodi čovek iz Rusije ! 🇷🇺 Geopolitika, vesti, analize i malo
## 6: 🌍 Pridružite se kanalu Stevan II, koji vodi čovek iz Rusije ! 🇷🇺 Geopolitika, vesti, analize i malo
## 7: 🌍 Pridružite se kanalu Stevan II, koji vodi čovek iz Rusije ! 🇷🇺 Geopolitika, vesti, analize i malo
## 8: 🇭🇺🇪🇺🇺🇸 Tramp predložio Orbanu da Mađarska izađe iz Evropske unije 💬 Mađarski premijer Viktor Orban i
## 9: Ајмо браћо и сестре да помогнемо, Срби су вазда били сложни за овакве ствари време је за нову мини б
## 10: 🇷🇸🚨 Beograd - Protest studenata ispred Ministarstva prosvete 🔗 Čitajte i zapratite nas: Nulta Tačka
## 11: 📌КОЛАПС ИЗРАЕЛСКЕ ОДБРАНЕ!!!!!📌 🇷🇸#Српски 🇷🇸 #Сербский ⚡️⚡️⚡️⚡️⚡️👇👇👇 📱 InfoDefenseSERBIA 📱 InfoDefen
## 12: 🔈 Nulta Tačka Vaš SIGURNI izvor najnovijih informacija kojih nema u mejnstrim medijima 🖥 Nulta Tačka
## 13: ⭕️ Nulta Tačka Vaš SIGURNI izvor najnovijih informacija kojih nema u mejnstrim medijima ⭕️ 👨💻 Porta
## 14: ⭕️ Nulta Tačka Vaš SIGURNI izvor najnovijih informacija kojih nema u mejnstrim medijima ⭕️ 👨💻 Porta
## 15: Dobar dan Bilja. Htelabi Vas zamoliti, da podelite link kanala Srpskih dobrovoljaca u Rusiji.
## 16: Јуриј Подољака преноси извештај @wargonzo ⚡️Ситуација у правцу Орехов-Работино u 13:00 по московском
## 17: Migrant kod ekonomskog divlja nozem i udara po drvecu u parku, plasi prolaznike
## 18: 🎥 SNS degenerik koji danas zamalo nije pregazio studentkinju kod FON a.
## 19: 🇷🇸 Dr James Thorp kaže da je od Covid-19 ""vakcina"" do sada ubijeno ili teško povređeno više od 585
## 20: PODELITE! DA NAROD VIDI ISTINU! Priština: Vučićeva Srpska lista položila zakletvu Kosovu, pa odala p
## text_preview
# Trend CTA score kroz vreme
cta_by_date <- data.table() # Inicijalizacija
if ("date" %in% names(dt)) {
cta_by_date <- dt[!is.na(date), .(avg_score = mean(cta_score, na.rm = TRUE),
count = sum(flag_cta, na.rm = TRUE)),
by = date][order(date)]
if (nrow(cta_by_date) > 0) {
p5 <- ggplot(cta_by_date, aes(x = date)) +
geom_line(aes(y = avg_score), color = "blue", linewidth = 1) +
geom_line(aes(y = count / max(count, na.rm = TRUE) * max(avg_score, na.rm = TRUE)),
color = "red", linewidth = 1, linetype = "dashed") +
scale_y_continuous(
name = "Prosečan CTA score",
sec.axis = sec_axis(~ . / max(cta_by_date$avg_score, na.rm = TRUE) * max(cta_by_date$count, na.rm = TRUE),
name = "Broj CTA postova")
) +
labs(
title = "Trend CTA score i broj CTA postova",
x = "Datum"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p5)
}
}# Identifikacija "talasa" (pikovi)
if ("date" %in% names(dt) && nrow(cta_by_date) > 0) {
# Pronađi dane sa iznadprosečnim CTA aktivnostima
threshold <- quantile(cta_by_date$count, 0.9, na.rm = TRUE)
peaks <- cta_by_date[count >= threshold]
cat("\n=== TOP TALASI (Pikovi) ===\n")
if (nrow(peaks) > 0) {
print(peaks[order(-count)])
}
}##
## === TOP TALASI (Pikovi) ===
## date avg_score count
## <Date> <num> <int>
## 1: 2022-10-13 0.2307692 6
## 2: 2025-06-15 0.1323529 3
## 3: 2022-10-14 0.2800000 2
## 4: 2022-10-16 2.1428571 2
## 5: 2023-10-09 0.1944444 2
## ---
## 154: 2025-11-29 0.1111111 1
## 155: 2025-11-30 0.1304348 1
## 156: 2025-12-07 0.1153846 1
## 157: 2025-12-08 0.1034483 1
## 158: 2025-12-11 0.1666667 1
# Priprema korpusa za quanteda
corpus <- quanteda::corpus(dt$text_clean, docvars = data.frame(
group = dt$group_anon,
date = dt$date,
cta_score = dt$cta_score,
mentions_kg = dt$mentions_kg
))
# Tokenizacija
tokens_obj <- quanteda::tokens(
corpus,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
remove_url = TRUE
)
# Ukloni stopwords (srpski + engleski)
# Pokušaj da učitaš srpski stopwords, ako ne postoji koristi samo engleski
sr_stopwords <- tryCatch({
quanteda::stopwords("sr")
}, error = function(e) {
character(0)
})
custom_stopwords <- c(
sr_stopwords,
quanteda::stopwords("en"),
"ovo", "to", "što", "šta", "koji", "koja", "koje",
"je", "su", "bi", "će", "ćemo", "ćete", "ćeš", "ću",
"sam", "si", "smo", "ste", "su", "jesam", "jesi", "jesmo", "jeste", "jesu",
"ću", "ćeš", "će", "ćemo", "ćete", "će",
"sam", "si", "je", "smo", "ste", "su",
"bio", "bila", "bilo", "bili", "bile",
"biti", "bit", "bi", "u", "na", "i", "su", "se", "nas", "i", "je", "za", "da"
)
tokens_obj <- quanteda::tokens_remove(tokens_obj, pattern = custom_stopwords)
# Kreiraj DFM
dfm_obj <- quanteda::dfm(tokens_obj)
# Top termini
top_terms <- quanteda::topfeatures(dfm_obj, n = 50)
cat("=== TOP 50 TERMINA ===\n")## === TOP 50 TERMINA ===
## ⚡️ у и је
## 80897 73890 73498 70150
## 🇷🇸 да на се
## 57984 55995 42293 38573
## су за infodefenseserbia од
## 33420 24233 21568 18659
## #сербский #српски infodefense са
## 18519 18511 17694 15916
## нас не а ће
## 14481 11058 10537 10094
# Bigrami
tokens_bigrams <- quanteda::tokens_ngrams(tokens_obj, n = 2)
dfm_bigrams <- quanteda::dfm(tokens_bigrams)
top_bigrams <- quanteda::topfeatures(dfm_bigrams, n = 30)
cat("\n=== TOP 30 BIGRAMA ===\n")##
## === TOP 30 BIGRAMA ===
## ⚡️_⚡️ 🇷🇸_#сербский
## 56105 18516
## #српски_🇷🇸 🇷🇸_#српски
## 18508 18372
## infodefenseserbia_infodefense 🇷🇸_🇷🇸
## 17461 10845
## нас_infodefenseserbia пратите_нас
## 9854 9500
## #сербский_пратите ⚡️_infodefenseserbia
## 9301 9064
## #сербский_⚡️ да_се
## 9054 7294
## оружаних_снага t.me_buntcg
## 7165 6655
## да_је
## 6011
# TF-IDF po grupama
dfm_grouped <- quanteda::dfm_group(dfm_obj, groups = quanteda::docvars(corpus, "group"))
tfidf_grouped <- quanteda::dfm_tfidf(dfm_grouped)
# Top TF-IDF termini po grupama
cat("\n=== TOP TF-IDF TERMINI PO GRUPAMA ===\n")##
## === TOP TF-IDF TERMINI PO GRUPAMA ===
for (grp in quanteda::docnames(tfidf_grouped)) {
top_tfidf <- quanteda::topfeatures(tfidf_grouped[grp,], n = 10)
cat("\n", grp, ":\n")
print(top_tfidf)
}##
## Group_A :
## #сербский #српски infodefense палчеве подољака осу
## 3260.6818 3259.2731 3115.4066 809.1976 645.0679 633.1399
## осташко чат-бот infodefall рввоенкор
## 585.9049 535.1413 532.5000 520.5393
##
## Group_B :
## narodnapatrola снс principshop србин.инфо суботице
## 215.359610 26.765871 22.187499 10.973789 9.332837
## белграде дамњана русов paladins subscribe
## 9.065304 8.100198 7.633940 7.156819 6.679698
##
## Group_C :
## 🇮🇷 🇮🇱 narodnapatrola шиптари снс
## 27.11805 26.23760 18.31349 17.43303 16.90476
## шиптарски приштини иран митровица митровици
## 16.90476 16.02430 14.79167 12.32639 11.97421
##
## === TOPIC MODELING ===
## Priprema podataka za topic modeling...
# Konvertuj DFM u format za STM
dfm_stm <- quanteda::convert(dfm_obj, to = "stm")
# Odredi optimalan broj tema (može se prilagoditi)
K <- 10 # TODO: Prilagodi broj tema (8-15)
# STM model
stm_model <- stm::stm(
documents = dfm_stm$documents,
vocab = dfm_stm$vocab,
K = K,
data = dfm_stm$meta,
init.type = "Spectral",
seed = 12345
)## Beginning Spectral Initialization
## Calculating the gram matrix...
## Using only 10000 most frequent terms during initialization...
## Finding anchor words...
## ..........
## Recovering initialization...
## ....................................................................................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (8 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -8.574)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -7.853, relative change = 8.408e-02)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -7.737, relative change = 1.477e-02)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -7.652, relative change = 1.096e-02)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 5 (approx. per word bound = -7.601, relative change = 6.727e-03)
## Topic 1: и, на, се, у, су
## Topic 2: infodefenseserbia, infodefense, #српски, је, и
## Topic 3: у, t.me, је, да, 🇷🇸
## Topic 4: је, да, и, у, за
## Topic 5: и, је, да, се, на
## Topic 6: у, су, је, 🇷🇸, од
## Topic 7: у, 🇷🇸, на, #сербский, нас
## Topic 8: да, на, се, у, за
## Topic 9: ⚡️, у, је, и, да
## Topic 10: 🇷🇸, и, да, се, 🇷🇺
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 6 (approx. per word bound = -7.569, relative change = 4.168e-03)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -7.550, relative change = 2.584e-03)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -7.536, relative change = 1.749e-03)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 9 (approx. per word bound = -7.527, relative change = 1.304e-03)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 10 (approx. per word bound = -7.519, relative change = 1.051e-03)
## Topic 1: и, у, се, на, да
## Topic 2: infodefenseserbia, infodefense, #српски, #сербский, је
## Topic 3: у, t.me, је, 🇷🇸, на
## Topic 4: је, да, и, у, се
## Topic 5: и, је, да, се, у
## Topic 6: су, је, у, од, 🇷🇸
## Topic 7: 🇷🇸, на, у, нас, и
## Topic 8: да, на, у, се, и
## Topic 9: ⚡️, у, 🇷🇸, и, 🇷🇺
## Topic 10: 🇷🇸, и, ❤️, нас, истина
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 11 (approx. per word bound = -7.512, relative change = 9.022e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 12 (approx. per word bound = -7.506, relative change = 8.017e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 13 (approx. per word bound = -7.501, relative change = 7.003e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 14 (approx. per word bound = -7.496, relative change = 5.919e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 15 (approx. per word bound = -7.492, relative change = 5.013e-04)
## Topic 1: у, и, се, на, да
## Topic 2: infodefenseserbia, infodefense, #српски, #сербский, пратите
## Topic 3: у, t.me, је, buntcg, 🇷🇸
## Topic 4: је, да, и, у, се
## Topic 5: и, је, да, у, се
## Topic 6: је, су, у, од, за
## Topic 7: 🇷🇸, на, у, и, нас
## Topic 8: да, у, и, на, се
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, у, и
## Topic 10: 🇷🇸, и, нас, ❤️, запратите
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 16 (approx. per word bound = -7.489, relative change = 4.405e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 17 (approx. per word bound = -7.486, relative change = 4.018e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 18 (approx. per word bound = -7.483, relative change = 3.596e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 19 (approx. per word bound = -7.481, relative change = 3.152e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 20 (approx. per word bound = -7.479, relative change = 2.861e-04)
## Topic 1: у, и, на, се, да
## Topic 2: infodefenseserbia, #српски, infodefense, #сербский, пратите
## Topic 3: у, t.me, је, buntcg, 🇷🇸
## Topic 4: да, је, и, у, се
## Topic 5: и, је, да, у, се
## Topic 6: је, у, су, од, за
## Topic 7: 🇷🇸, на, у, и, нас
## Topic 8: да, у, и, на, је
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, у, infodefenseserbia
## Topic 10: 🇷🇸, нас, и, ❤️, запратите
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 21 (approx. per word bound = -7.477, relative change = 2.666e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 22 (approx. per word bound = -7.475, relative change = 2.518e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 23 (approx. per word bound = -7.473, relative change = 2.473e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 24 (approx. per word bound = -7.471, relative change = 2.464e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 25 (approx. per word bound = -7.470, relative change = 2.489e-04)
## Topic 1: у, и, на, се, су
## Topic 2: infodefenseserbia, #српски, #сербский, infodefense, 🇷🇸
## Topic 3: у, t.me, buntcg, је, 🇷🇸
## Topic 4: да, је, и, у, се
## Topic 5: и, је, да, у, се
## Topic 6: је, у, су, од, и
## Topic 7: на, 🇷🇸, у, и, су
## Topic 8: да, у, и, је, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, infodefenseserbia, и
## Topic 10: 🇷🇸, и, ❤️, нас, запратите
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 26 (approx. per word bound = -7.468, relative change = 2.557e-04)
## ....................................................................................................
## Completed E-Step (7 seconds).
## Completed M-Step.
## Completing Iteration 27 (approx. per word bound = -7.466, relative change = 2.626e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 28 (approx. per word bound = -7.464, relative change = 2.634e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 29 (approx. per word bound = -7.462, relative change = 2.488e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 30 (approx. per word bound = -7.460, relative change = 2.161e-04)
## Topic 1: у, и, на, се, су
## Topic 2: 🇷🇸, #српски, #сербский, infodefenseserbia, infodefense
## Topic 3: t.me, у, buntcg, 🇷🇸, је
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, и, од
## Topic 7: у, на, и, су, је
## Topic 8: да, у, је, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, infodefenseserbia, и
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 31 (approx. per word bound = -7.459, relative change = 1.811e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 32 (approx. per word bound = -7.458, relative change = 1.505e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 33 (approx. per word bound = -7.457, relative change = 1.322e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 34 (approx. per word bound = -7.456, relative change = 1.174e-04)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 35 (approx. per word bound = -7.455, relative change = 9.754e-05)
## Topic 1: у, и, на, се, су
## Topic 2: 🇷🇸, #српски, #сербский, infodefenseserbia, infodefense
## Topic 3: t.me, у, buntcg, 🇷🇸, је
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, на, су, је
## Topic 8: да, је, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 36 (approx. per word bound = -7.455, relative change = 8.653e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 37 (approx. per word bound = -7.454, relative change = 7.670e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 38 (approx. per word bound = -7.453, relative change = 6.629e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 39 (approx. per word bound = -7.453, relative change = 6.076e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 40 (approx. per word bound = -7.453, relative change = 5.765e-05)
## Topic 1: у, и, на, се, су
## Topic 2: 🇷🇸, #српски, #сербский, infodefenseserbia, infodefense
## Topic 3: t.me, у, buntcg, 🇷🇸, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, на, је, су
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 41 (approx. per word bound = -7.452, relative change = 5.435e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 42 (approx. per word bound = -7.452, relative change = 4.779e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 43 (approx. per word bound = -7.451, relative change = 4.647e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 44 (approx. per word bound = -7.451, relative change = 4.384e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 45 (approx. per word bound = -7.451, relative change = 4.200e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #српски, #сербский, infodefenseserbia, infodefense
## Topic 3: t.me, у, buntcg, 🇷🇸, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, на, су
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 46 (approx. per word bound = -7.451, relative change = 4.196e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 47 (approx. per word bound = -7.450, relative change = 4.018e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 48 (approx. per word bound = -7.450, relative change = 3.912e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 49 (approx. per word bound = -7.450, relative change = 3.738e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 50 (approx. per word bound = -7.449, relative change = 3.579e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, на, су
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 51 (approx. per word bound = -7.449, relative change = 3.285e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 52 (approx. per word bound = -7.449, relative change = 3.112e-05)
## ....................................................................................................
## Completed E-Step (6 seconds).
## Completed M-Step.
## Completing Iteration 53 (approx. per word bound = -7.449, relative change = 3.062e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 54 (approx. per word bound = -7.448, relative change = 2.995e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 55 (approx. per word bound = -7.448, relative change = 2.788e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, на, су
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 56 (approx. per word bound = -7.448, relative change = 2.699e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 57 (approx. per word bound = -7.448, relative change = 2.545e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 58 (approx. per word bound = -7.448, relative change = 2.478e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 59 (approx. per word bound = -7.447, relative change = 2.457e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 60 (approx. per word bound = -7.447, relative change = 2.332e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, на, су
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 61 (approx. per word bound = -7.447, relative change = 2.124e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 62 (approx. per word bound = -7.447, relative change = 2.013e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 63 (approx. per word bound = -7.447, relative change = 1.982e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 64 (approx. per word bound = -7.447, relative change = 1.934e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 65 (approx. per word bound = -7.447, relative change = 1.828e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, да, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, су, на
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 66 (approx. per word bound = -7.446, relative change = 1.708e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 67 (approx. per word bound = -7.446, relative change = 1.629e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 68 (approx. per word bound = -7.446, relative change = 1.663e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 69 (approx. per word bound = -7.446, relative change = 1.701e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 70 (approx. per word bound = -7.446, relative change = 1.690e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, на, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, су, на
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 71 (approx. per word bound = -7.446, relative change = 1.636e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 72 (approx. per word bound = -7.446, relative change = 1.599e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 73 (approx. per word bound = -7.446, relative change = 1.541e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 74 (approx. per word bound = -7.445, relative change = 1.484e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 75 (approx. per word bound = -7.445, relative change = 1.407e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, на, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, су, на
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, infodefenseserbia
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 76 (approx. per word bound = -7.445, relative change = 1.393e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 77 (approx. per word bound = -7.445, relative change = 1.477e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 78 (approx. per word bound = -7.445, relative change = 1.424e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 79 (approx. per word bound = -7.445, relative change = 1.324e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 80 (approx. per word bound = -7.445, relative change = 1.280e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, на, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, су, на
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, 🇷🇺, и, за
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 81 (approx. per word bound = -7.445, relative change = 1.249e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 82 (approx. per word bound = -7.445, relative change = 1.180e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 83 (approx. per word bound = -7.445, relative change = 1.214e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 84 (approx. per word bound = -7.444, relative change = 1.253e-05)
## ....................................................................................................
## Completed E-Step (4 seconds).
## Completed M-Step.
## Completing Iteration 85 (approx. per word bound = -7.444, relative change = 1.249e-05)
## Topic 1: у, и, на, су, се
## Topic 2: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense
## Topic 3: t.me, у, 🇷🇸, buntcg, и
## Topic 4: да, је, и, се, у
## Topic 5: и, је, у, на, се
## Topic 6: је, у, су, на, и
## Topic 7: у, и, је, су, на
## Topic 8: је, да, у, и, на
## Topic 9: ⚡️, 🇷🇸, и, 🇷🇺, за
## Topic 10: 🇷🇸, ❤️, и, запратите, чат-бот
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 86 (approx. per word bound = -7.444, relative change = 1.273e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 87 (approx. per word bound = -7.444, relative change = 1.195e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 88 (approx. per word bound = -7.444, relative change = 1.086e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Completing Iteration 89 (approx. per word bound = -7.444, relative change = 1.015e-05)
## ....................................................................................................
## Completed E-Step (5 seconds).
## Completed M-Step.
## Model Converged
##
## === TOP REČI PO TEMI ===
## Topic 1 Top Words:
## Highest Prob: у, и, на, су, се, од, је, снага, снаге, оружаних
## FREX: напредовале, напредовали, крилу, клешејевке, бердичи, богдановке, вербовог, авдејевски, повукле, напредујући
## Lift: напредовале, @colonelassad, @infidefenseserbia, @starshii_pogrannaryada, @vdv_vistrel, @voenkors, 01-02.05.24, 03-04.05.24, 04-05.05.24, 05.29-30.24
## Score: оружане, правцу, оружаних, непријатељ, северно, борбе, правац, украјине, непријатеља, напредовање
## Topic 2 Top Words:
## Highest Prob: 🇷🇸, #сербский, #српски, infodefenseserbia, infodefense, пратите, нас, извор, ‼️, у
## FREX: силовик, ирна, 🇵🇸, либана, ✡️, ☪️, либан, 🇱🇧, @irna_ru, јемена
## Lift: @zparabellummd, af, iranist, владлену, галичина, јасин, јунису, катарски, осама, рафе
## Score: #српски, #сербский, infodefense, пратите, infodefenseserbia, 🇷🇸, ‼️, нас, рввоенкор, т.ме
## Topic 3 Top Words:
## Highest Prob: t.me, у, 🇷🇸, buntcg, и, је, на, се, су, да
## FREX: вучић, bunkersrb, саду, студената, ухапшен, факултета, шиптари, приштини, митровици, снс-а
## Lift: #закон, #промена_пола, ✅️, 1991-1995, borislav, choose, cy, eludba, emerald, fighter
## Score: t.me, buntcg, narodnapatrola, србије, српске, саду, вучић, београду, србија, студенти
## Topic 4 Top Words:
## Highest Prob: да, је, и, се, у, не, то, на, ће, а
## FREX: залужни, резидент, проблем, п.с, јасно, дешава, једноставно, чак, много, онда
## Lift: •, 1-3, 20а, 25а, 30-40, bradley, fp-5, god, march, participants
## Score: то, не, ће, да, подољака, ако, јуриј, али, сам, ли
## Topic 5 Top Words:
## Highest Prob: и, је, у, на, се, да, су, од, који, за
## FREX: светог, сестре, црква, православне, христа, христос, ⚪️, господ, манастир, православна
## Lift: #патриотизам, #сећање, #слободазаигора, #смрт, @infodefensemailbot, @kompasinfo_rs, @krozistoriju, ↪️, ✉️, 1941-1945
## Score: t.me, срби, ☦️, сестре, buntcg, српски, српска, светог, српске, свети
## Topic 6 Top Words:
## Highest Prob: је, у, су, на, и, се, од, да, који, за
## FREX: фсб, гладков, погинуло, сериала, гувернер, истражни, цивиле, направа, гелера, повређено
## Lift: #11марта, #wyf2024, #азов, #артемовскбахмут, #артјомовск, #бебе, #бившизатвореници, #биолошкелабораторије, #блокадаводе, #буча
## Score: ▪️, гувернер, повређено, људи, оружаних, украјине, је, пожар, повређених, гладков
## Topic 7 Top Words:
## Highest Prob: у, је, и, су, на, снага, украјине, области, оружаних, ▪️
## FREX: пво, летелице, беспилотне, противваздушне, складиште, посада, ланцет, искандер, противваздушна, хаубице
## Lift: ланцет, #dejanpetar, #dejanpetarzlatanovic, #dejanpetarzlatanović, #sns, #srbininfo, #srđannogo, #srdjannogo, #studentiublokadi, #важное
## Score: летелица, ракета, беспилотних, пво, беспилотне, оружаних, летелице, ракете, ▪️, дронова
## Topic 8 Top Words:
## Highest Prob: је, да, у, и, на, за, се, са, ће, сад
## FREX: милијарди, сједињене, сједињених, кина, санкција, бајдена, орбан, блумберг, что, самиту
## Lift: бајденова, брикс-а, политико, фицо, кина, санкција, сједињене, сједињених, что, #army2022
## Score: трамп, еу, ▪️, долара, 🇺🇸, сад, путин, председник, русија, украјини
## Topic 9 Top Words:
## Highest Prob: ⚡️, 🇷🇸, и, 🇷🇺, за, се, у, infodefenseserbia, infodefense, је
## FREX: палчеве, помолимо, @divgen, новопавловски, сумски, з-комитет, држимо, момке, курски, селидово-кураховски
## Lift: #jurijpodoljaka, američka, bukvalno, crvenom, divgen, dva, gaze, huta, huti, ih
## Score: ⚡️, палчеве, држимо, помолимо, момке, #јуријподољака, infodefense, #српски, #сербский, infodefenseserbia
## Topic 10 Top Words:
## Highest Prob: 🇷🇸, ❤️, и, запратите, чат-бот, infodefall, нас, истина, infodefenseserbia, знање
## FREX: ❤️, чат-бот, infodefall, знање, слобода, моћ, објаву, проследите, буревестник, доушник
## Lift: 955а, andré, appcloud-а, collonelassad, debeljaca1vrs, ees, eмиграната, eмигранти, fpv-дрон, grmi
## Score: ❤️, знање, чат-бот, infodefall, запратите, проследите, истина, објаву, слобода, драги
# Proporcije tema
topic_props <- as.data.table(stm_model$theta)
colnames(topic_props) <- paste0("Topic_", 1:K)
topic_props[, doc_id := seq_len(.N)]
# Prosečne proporcije tema
avg_topic_props <- colMeans(topic_props[, 1:K, with = FALSE])
cat("\n=== PROSEČNE PROPORCIJE TEMA ===\n")##
## === PROSEČNE PROPORCIJE TEMA ===
## Topic_3 Topic_4 Topic_2 Topic_8 Topic_1 Topic_7 Topic_9
## 0.15959777 0.13169090 0.11997297 0.10966132 0.09768904 0.09486269 0.08846085
## Topic_5 Topic_6 Topic_10
## 0.08529751 0.07686452 0.03590243
# Vizualizacija tema
p6 <- ggplot(data.table(
Topic = paste0("Topic_", 1:K),
Proportion = avg_topic_props
), aes(x = reorder(Topic, Proportion), y = Proportion)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Proporcije tema",
x = "Tema",
y = "Proporcija"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p6)# Mrežna analiza deljenja
if ("forwarded_from" %in% names(dt)) {
cat("=== MREŽNA ANALIZA DELJENJA ===\n")
# Kreiraj edge list (izvor -> grupa)
edges <- dt[!is.na(forwarded_from) & forwarded_from != "",
.(from = forwarded_from, to = group_anon)]
if (nrow(edges) > 0) {
# Kreiraj graf
g <- igraph::graph_from_data_frame(edges, directed = TRUE)
# Centralnosti
degree_centrality <- igraph::degree(g, mode = "in")
betweenness_centrality <- igraph::betweenness(g)
# Top izvori (najviše deljeno)
top_sources <- sort(degree_centrality, decreasing = TRUE)[1:min(10, length(degree_centrality))]
cat("\nTop izvori (najviše deljeno):\n")
print(head(top_sources, 10))
# Top hub grupe
top_hubs <- sort(igraph::degree(g, mode = "out"), decreasing = TRUE)[1:min(10, length(igraph::degree(g, mode = "out")))]
cat("\nTop hub grupe (najviše deli):\n")
print(head(top_hubs, 10))
# Vizualizacija (pojednostavljena)
if (igraph::vcount(g) <= 50) { # Samo za manje mreže
plot(g,
vertex.size = 5,
vertex.label.cex = 0.6,
edge.arrow.size = 0.3,
layout = igraph::layout_with_fr(g))
} else {
cat("Mreža prevelika za vizualizaciju (", igraph::vcount(g), "čvorova)\n")
}
} else {
cat("Nema podataka o deljenju za mrežnu analizu.\n")
}
} else {
cat("Kolona 'forwarded_from' nije dostupna. Preskačem mrežnu analizu.\n")
}## === SENTIMENT ANALIZA ===
## Napomena: Sentiment je pomoćna metrika, ne dokaz.
# Sentiment analiza (sentimentr)
# Napomena: sentimentr radi bolje sa engleskim, ali može dati osnovne indikatore
# Funkcija za merenje "uzbunjivanja"
measure_intensity <- function(text) {
if (is.na(text) || text == "") return(0)
intensity <- 0
# ALL CAPS ratio
caps_ratio <- str_count(text, "[A-ZА-Я]") / max(nchar(text), 1)
if (caps_ratio > 0.3) intensity <- intensity + 1
# Uzvičnici
exclamation_count <- str_count(text, "!")
intensity <- intensity + min(exclamation_count, 5) * 0.2
# "Hitno", "Sramota", itd.
alarm_words <- c("hitno", "urgentno", "sramota", "skandal", "katastrofa",
"opasnost", "pazite", "oprez")
for (word in alarm_words) {
if (grepl(paste0("\\b", word, "\\b"), text, ignore.case = TRUE)) {
intensity <- intensity + 0.5
}
}
return(intensity)
}
dt[, intensity := sapply(text_clean, measure_intensity)]
cat("Prosečan intenzitet:", round(mean(dt$intensity, na.rm = TRUE), 2), "\n")## Prosečan intenzitet: 0.04
## Postovi sa visokim intenzitetom (>= 2): 2
# Trend intenziteta
if ("date" %in% names(dt)) {
intensity_by_date <- dt[!is.na(date), .(avg_intensity = mean(intensity, na.rm = TRUE)),
by = date][order(date)]
p7 <- ggplot(intensity_by_date, aes(x = date, y = avg_intensity)) +
geom_line(color = "purple", linewidth = 1) +
geom_point(color = "purple", alpha = 0.6) +
labs(
title = "Trend intenziteta (uzbunjivanja) kroz vreme",
x = "Datum",
y = "Prosečan intenzitet"
) +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p7)
}## === SAŽETAK NALAZA ===
## 1. OPIS DATASETA
## - Ukupan broj postova: 34609
## - Broj grupa: 3
if ("date" %in% names(dt)) {
cat(" - Vremenski raspon:", min(dt$date, na.rm = TRUE), "do",
max(dt$date, na.rm = TRUE), "\n")
}## - Vremenski raspon: 18894 do 20434
##
## 2. KRAGUJEVAC
cat(" - Postovi koji pominju Kragujevac:", sum(dt$mentions_kg, na.rm = TRUE),
"(", round(100 * mean(dt$mentions_kg, na.rm = TRUE), 2), "%)\n")## - Postovi koji pominju Kragujevac: 37 ( 0.11 %)
##
## 3. ORGANIZOVANJE PROTESTA (CTA)
cat(" - Postovi sa CTA karakteristikama:", sum(dt$flag_cta, na.rm = TRUE),
"(", round(100 * mean(dt$flag_cta, na.rm = TRUE), 2), "%)\n")## - Postovi sa CTA karakteristikama: 189 ( 0.55 %)
## - Prosečan CTA score: 0.04
##
## 4. TEME I NARATIVI
## - Identifikovano tema: 10
## - Top termini i bigrami su prikazani u sekciji 'Propagandni Narativi i Teme'
##
## 5. INTENZITET
## - Prosečan intenzitet uzbunjivanja: 0.04
## - Postovi sa visokim intenzitetom: 2
##
## === NAPOMENA ===
## Ova analiza identifikuje obrasce u javnom diskursu.
## Rezultati su agregirani i anonimizovani u skladu sa etičkim principima.
Ograničenja analize:
# Export flagovanih postova (anonimizovano)
export_data <- dt[, .(
row_id,
group_anon,
date,
cta_score,
flag_cta,
mentions_kg,
intensity,
text_preview = substr(text_clean, 1, 200) # Samo preview
)]
# Sačuvaj u CSV
export_path_csv <- file.path(OUTPUT_DIR, "flagged_posts_anonimized.csv")
data.table::fwrite(export_data, export_path_csv)
cat("Eksportovano u:", export_path_csv, "\n")## Eksportovano u: output/flagged_posts_anonimized.csv
# Sačuvaj u Parquet (brže za sledeće učitavanje)
if (requireNamespace("arrow", quietly = TRUE)) {
export_path_parquet <- file.path(OUTPUT_DIR, "flagged_posts_anonimized.parquet")
arrow::write_parquet(export_data, export_path_parquet)
cat("Eksportovano u Parquet:", export_path_parquet, "\n")
}## Eksportovano u Parquet: output/flagged_posts_anonimized.parquet
# Sačuvaj agregirane statistike
stats_summary <- data.table(
metric = c(
"total_posts",
"total_groups",
"posts_mentioning_kg",
"posts_with_cta",
"avg_cta_score",
"avg_intensity"
),
value = c(
nrow(dt),
length(unique(dt$group_anon)),
sum(dt$mentions_kg, na.rm = TRUE),
sum(dt$flag_cta, na.rm = TRUE),
round(mean(dt$cta_score, na.rm = TRUE), 2),
round(mean(dt$intensity, na.rm = TRUE), 2)
)
)
stats_path <- file.path(OUTPUT_DIR, "summary_statistics.csv")
data.table::fwrite(stats_summary, stats_path)
cat("Statistike sačuvane u:", stats_path, "\n")## Statistike sačuvane u: output/summary_statistics.csv
## === REČNICI ===
## Kragujevac keywords:
## [1] "Kragujevac" "Kragujevcu" "Kragujevca" "Kragujevcem"
## [5] "KG" "kg" "K.G." "Šumadija"
## [9] "Šumadije" "Šumadiji" "Šumadijom" "Stanovo"
## [13] "Stanova" "Aerodrom" "Aerodroma" "Centar"
## [17] "Centru" "Centra" "Kragujevčani" "Kragujevčanima"
##
## CTA phrases:
## [1] "okupljanje" "okupljanja" "okupljanju" "protest" "protesta"
## [6] "protestu" "proteste" "večeras u" "večeras u" "sutra u"
## [11] "sutra u" "ponesite" "ponesite" "ponesite" "blokada"
## [16] "blokade" "blokadi" "skup" "skupa" "skupu"
##
## === SESSION INFO ===
## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Europe/Budapest
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.50 sentimentr_2.9.0 stringr_1.5.1
## [5] wordcloud_2.6 RColorBrewer_1.1-3 DT_0.33 ggplot2_3.5.2
## [9] igraph_2.2.1 stm_1.3.8 tidytext_0.4.2 quanteda_4.3.1
## [13] lubridate_1.9.4 dplyr_1.1.4 arrow_22.0.0 data.table_1.17.4
##
## loaded via a namespace (and not attached):
## [1] fastmatch_1.1-6 gtable_0.3.6 xfun_0.52 bslib_0.9.0
## [5] htmlwidgets_1.6.4 lattice_0.22-6 tzdb_0.5.0 vctrs_0.6.5
## [9] tools_4.5.0 generics_0.1.4 tibble_3.2.1 janeaustenr_1.0.0
## [13] pkgconfig_2.0.3 tokenizers_0.3.0 Matrix_1.7-3 assertthat_0.2.1
## [17] lifecycle_1.0.4 compiler_4.5.0 farver_2.1.2 textshaping_1.0.1
## [21] codetools_0.2-20 qdapRegex_0.7.10 SnowballC_0.7.1 htmltools_0.5.8.1
## [25] sass_0.4.10 yaml_2.3.10 pillar_1.10.2 jquerylib_0.1.4
## [29] cachem_1.1.0 lexicon_1.2.1 stopwords_2.3 tidyselect_1.2.1
## [33] digest_0.6.37 stringi_1.8.7 purrr_1.0.4 labeling_0.4.3
## [37] fastmap_1.2.0 grid_4.5.0 cli_3.6.5 magrittr_2.0.3
## [41] textclean_0.9.3 withr_3.0.2 scales_1.4.0 bit64_4.6.0-1
## [45] timechange_0.3.0 rmarkdown_2.29 matrixStats_1.5.0 bit_4.6.0
## [49] evaluate_1.0.5 viridisLite_0.4.2 rlang_1.1.6 Rcpp_1.0.14
## [53] syuzhet_1.0.7 glue_1.8.0 xml2_1.3.8 svglite_2.2.2
## [57] rstudioapi_0.17.1 jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1
Kraj izveštaja
Generisano: 2025-12-15