Se aplica LDA (n = 20) considerando cada entrada de desinformación como un documento. Se muestan los ochos términos más caracterÃsticos para cada topic. Los cluster parecen congruentes se diferencian bastante entre topics las palabras más caracterÃsticas de cada uno.
stop_words_vc <- paste(stop_words$word, collapse = " | ")
read_tsv(here::here("data_web", "data_20200504T164147.tsv")) %>%
select(summary, country) %>%
rowid_to_column(var = "document") %>%
mutate(document = paste(document, "document")) %>%
unnest_tokens(word, summary) %>%
filter(!word %in% stop_words$word) %>%
# mutate_at("word", ~SnowballC::wordStem(., language="en")) %>%
mutate(country = str_remove(country, "^Country: ")) %>%
filter(!str_detect(country, "Keywords: ")) %>%
mutate(country = str_replace_all(country,
"Bosnia and Herzegovina",
"BosniaHerzegovina")) %>%
mutate(country = str_remove_all(country, ",?and\\s+"),
with_stop_words = str_detect(country, stop_words_vc),
new_country = countrycode::countrycode(country,
"country.name",
"country.name",
nomatch = "None",
warn = F),
country = case_when(
with_stop_words == T ~ new_country,
TRUE ~ country
)) %>%
select(document:word) -> tidy_words
tidy_words %>%
count(document, word, sort = T) %>%
cast_dtm(document, word, n) -> disinfo_dtm
disinfo_lda <- topicmodels::LDA(disinfo_dtm, k = 20, control = list(seed = 1234))
disinfo_lda %>%
tidy() %>%
group_by(topic) %>%
top_n(8, beta) %>%
ungroup() -> disinfor_lda_tidy
disinfor_lda_tidy %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~topic, ncol = 3, scales = "free") +
coord_flip() +
scale_x_reordered()
Notas: - Seguir limpiando country - Menos o más topics¿? - stemming?
La siguiente tabla muestra los topics y los
disinfo_lda %>%
tidy(matrix = "gamma") %>%
left_join(tidy_words %>%
select(document, country) %>%
distinct()) %>%
arrange(topic, desc(gamma)) %>%
group_by(topic) %>%
top_n(4, wt = gamma) %>%
# separate_rows(country, sep = ", ") %>%
select(topic, country) %>%
summarise(country = paste(country, collapse = "\n")) %>%
ungroup() %>%
left_join(disinfor_lda_tidy %>%
group_by(topic) %>%
summarise(term = paste(term, collapse = " "))) %>%
mutate(topic = paste("Topic", topic)) %>%
group_by(topic) %>%
gt::gt()
| country | term |
|---|---|
| Topic 1 | |
| Latvia, Russia, Estonia, Lithuania, Germany USSR, Germany, Poland USSR, Russia, Germany, Poland USSR, Russia, Germany | soviet poland polish history union germany ussr nazi |
| Topic 2 | |
| Russia, US Russia, US Russia, US Kazakhstan | russian russia trump elections german media election presidential |
| Topic 3 | |
| Russia, Ukraine Malaysia, Russia, Ukraine, The Netherlands Australia, Malaysia, Russia, Ukraine, The Netherlands Ukraine, The Netherlands | russian russia missile skripal investigation information evidence mh17 |
| Topic 4 | |
| Brazil, US Russia, US Norway, Germany US | russia war nato civil world west finland cold |
| Topic 5 | |
| Uzbekistan, Armenia, Azerbaijan, Moldova, Ukraine, Georgia EU, Portugal, Greece, Germany, Spain EU, Germany EU, Italy, Germany | russia european eu countries sanctions europe union economic |
| Topic 6 | |
| Russia, Ukraine Ukraine Russia, Ukraine Russia, Ukraine | ukraine russia ukrainian donbas kyiv coup 2014 conflict |
| Topic 7 | |
| Russia, Ukraine Russia, Sweden Russia Russia | russian russia crimea international 2014 putin referendum crimean |
| Topic 8 | |
| Russia, Ukraine US Uzbekistan, Tajikistan, Armenia, Kazakhstan, Azerbaijan, Moldova, Russia, Ukraine, Belarus, Georgia Russia, Belarus, US, Poland | belarus russian russia russians policy west western anti |
| Topic 9 | |
| US US Sweden EU, Canada, US | global coronavirus system world people orthodox church 19 |
| Topic 10 | |
| UK, Belarus Israel, UK, Belarus, US, Sweden Romania, Moldova Georgia | belarus belarusian soros opposition society people children rights |
| Topic 11 | |
| China, US, France Cuba, Canada, China, US China, UK, US China, UK | china coronavirus chinese laboratory american virus biological spread |
| Topic 12 | |
| Bulgaria, Georgia Australia, Czech Republic Slovakia, Russia Slovakia, Russia | military ukrainian czech republic ministry donetsk people’s luhansk |
| Topic 13 | |
| Afghanistan, Syria US, Syria Iraq, US, Syria Iran, Russia | russian syria turkish estonia turkey isis trend week’s |
| Topic 14 | |
| Ukraine Ukraine Ukraine Ukraine | ukrainian foreign people country authorities minister citizens nazis |
| Topic 15 | |
| Venezuela, US Israel, Russia US, Georgia Georgia | nato georgia south georgian security terrorism united threat |
| Topic 16 | |
| Russia, US Latvia, Russia, Estonia, Lithuania, US Russia, US Russia | belarus russian russia countries military nato nuclear europe |
| Topic 17 | |
| Russia, Ukraine Russia, Ukraine Russia, Ukraine Russia, Ukraine | russian poroshenko ukrainian president army soldiers law petro |
| Topic 18 | |
| Syria Syria Russia, Syria Russia, US, Syria | chemical weapons syria helmets white syrian terrorist attack |
| Topic 19 | |
| France France France France | fire migrants france treaty french police inf million |
| Topic 20 | |
| Venezuela Denmark Japan, US Venezuela, US | president country refugees government east middle revolution colour |
stop_words_vc <- paste(stop_words$word, collapse = " | ")
read_tsv(here::here("data_web", "data_20200504T164147.tsv")) %>%
filter(str_detect(keywords, pattern = "coronavirus")) %>%
select(summary, country) %>%
rowid_to_column(var = "document") %>%
mutate(document = paste(document, "document")) %>%
unnest_tokens(word, summary) %>%
filter(!word %in% stop_words$word) %>%
# mutate_at("word", ~SnowballC::wordStem(., language="en")) %>%
mutate(country = str_remove(country, "^Country: ")) %>%
filter(!str_detect(country, "Keywords: ")) %>%
mutate(country = str_replace_all(country,
"Bosnia and Herzegovina",
"BosniaHerzegovina")) %>%
mutate(country = str_remove_all(country, ",?and\\s+"),
with_stop_words = str_detect(country, stop_words_vc),
new_country = countrycode::countrycode(country,
"country.name",
"country.name",
nomatch = "None",
warn = F),
country = case_when(
with_stop_words == T ~ new_country,
TRUE ~ country
)) %>%
select(document:word) -> tidy_words
tidy_words %>%
count(document, word, sort = T) %>%
cast_dtm(document, word, n) -> disinfo_dtm
disinfo_lda <- topicmodels::LDA(disinfo_dtm, k = 9, control = list(seed = 1234))
disinfo_lda %>%
tidy() %>%
group_by(topic) %>%
top_n(8, beta) %>%
ungroup() -> disinfor_lda_tidy
disinfor_lda_tidy %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~topic, ncol = 3, scales = "free") +
coord_flip() +
scale_x_reordered()
disinfo_lda %>%
tidy(matrix = "gamma") %>%
left_join(tidy_words %>%
select(document, country) %>%
distinct()) %>%
arrange(topic, desc(gamma)) %>%
group_by(topic) %>%
top_n(4, wt = gamma) %>%
# separate_rows(country, sep = ", ") %>%
select(topic, country) %>%
summarise(country = paste(country, collapse = "\n")) %>%
ungroup() %>%
left_join(disinfor_lda_tidy %>%
group_by(topic) %>%
summarise(term = paste(term, collapse = " "))) %>%
mutate(topic = paste("Topic", topic)) %>%
group_by(topic) %>%
gt::gt()
| country | term |
|---|---|
| Topic 1 | |
| US China, US Russia EU, China, US | eu china countries coronavirus world italy epidemic people |
| Topic 2 | |
| EU, Italy, Russia, US EU, Czech Republic EU, Canada, US EU | eu coronavirus world virus russia crisis 19 covid |
| Topic 3 | |
| US US US US | european coronavirus global world virus pandemic economic system |
| Topic 4 | |
| China, Russia, US Iran, China, Russia, US, France China, US, France China, UK, US | china coronavirus chinese virus biological laboratory american covid |
| Topic 5 | |
| Italy, Germany, France Switzerland, US Italy, China, Latvia, Russia, Baltic states, Estonia, Lithuania, US Greece | eu countries european coronavirus europe italy pandemic union |
| Topic 6 | |
| UK US Russia, Ukraine Italy, US | coronavirus global epidemic ukraine pandemic russia ukrainian people |
| Topic 7 | |
| US China, Italy, UK, US China, UK, US Belarus | china coronavirus world virus biological people population american |
| Topic 8 | |
| US, France China, Russia, US Italy, Russia, Ukraine US | china coronavirus russian italy russia situation 19 aid |
| Topic 9 | |
| China, UK, Russia, Ukraine, US US EU, US, Germany Italy, Greece, UK, Latvia, Russia, Lithuania, US, Belgium | china coronavirus world western russian russia people doctors |