# Libraries
pacman::p_load(tidyverse, countrycode, sf, state, htmltools, urltools, htmlwidgets, janitor, RColorBrewer, ussf, arrow, hrbrthemes, lubridate, estimatr, texreg, readxl, corrr, wordcloud, tm, topicmodels, ldatuning, lda)
options(arrow.skip_nul = TRUE)

# Load data
df_yandex <- read_parquet(here::here("data/shared/webconf2023/yandex/yandex_trends_page1.parquet"))
df_google <-
bind_rows(
read_parquet(here::here("data/shared/webconf2023/russia.parquet")),
read_parquet(here::here("data/shared/webconf2023/ukraine.parquet")),
read_parquet(here::here("data/shared/webconf2023/united_kingdom.parquet")),
read_parquet(here::here("data/shared/webconf2023/united_states.parquet"))
)
# getting domain root
df_google <-
df_google %>%
bind_cols(
suffix_extract(df_google$domain) %>% transmute(domain_clean = domain)
) %>%
mutate(
rsm_domain = str_detect(domain_clean, "^rt$|^ria$|^tass$|^lenta$|^sputniknews$|^tvzvezda$|^redfish$|^gazeta$|^ruptly$"),
)
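# Quick sanity check of the domain-root logic on a few made-up hostnames (not rows
# from the data); suffix_extract() returns host, subdomain, domain, and suffix columns,
# and the regex below is the same one used above.
suffix_extract(c("www.rt.com", "ria.ru", "news.bbc.co.uk")) %>%
  transmute(
    host,
    domain_clean = domain,
    rsm_domain = str_detect(domain_clean, "^rt$|^ria$|^tass$|^lenta$|^sputniknews$|^tvzvezda$|^redfish$|^gazeta$|^ruptly$")
  )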
df_yandex <-
df_yandex %>%
bind_cols(
suffix_extract(df_yandex$domain) %>% transmute(domain_clean = domain)
) %>%
mutate(
rsm_domain = str_detect(domain_clean, "^rt$|^ria$|^tass$|^lenta$|^sputniknews$|^tvzvezda$|^redfish$|^gazeta$|^ruptly$"),
)

Most popular RSM domains (Yandex web results):
df_yandex %>%
filter(rsm_domain) %>%
count(domain) %>%
arrange(-n) %>%
mutate(domain = str_c('<a href="http://', domain, '" target="_blank">', domain, '</a>')) %>%
DT::datatable(escape = F, options = list(pageLength = 10))

Variation over time:
df_yandex %>%
group_by(date_clean) %>%
summarize(
prop_rsm = mean(rsm_domain, na.rm = T)
) %>%
filter(year(date_clean) > 2021) %>%
ggplot(aes(date_clean, prop_rsm)) +
geom_smooth(method = "loess", se = T, span = 0.1) +
geom_vline(xintercept = as.POSIXct(as.Date("2022-02-24")), linetype = "dashed", color = "red") +
scale_x_datetime(date_breaks = "1 month", date_labels = "%B") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = -45), legend.position = "bottom") +
labs(
x = "", y = "% Russian State Media Results on First Results Page"
)

The most popular RSM domains on the first page of web results, by country and search engine, are shown below. There are negligible results for the US/UK because RSM results are down-weighted there and hence appear on later pages.
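As a quick check on the "later pages" point, a minimal sketch that reuses the first-page cutoff (rank_group <= 14) applied in the code below; column names are the same ones used throughout this document.

# share of organic RSM results that fall beyond the first-page cutoff, by country
df_google %>%
  filter(type == "organic", rsm_domain) %>%
  group_by(country) %>%
  summarize(share_beyond_page1 = mean(rank_group > 14, na.rm = TRUE), n = n(), .groups = "drop")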
bind_rows(
"Yandex Web" = df_yandex %>% mutate(country = "russia_yandex"),
"Google Web Russia" = df_google %>% filter(country == "russia", rank_group <= 14, type == "organic"),
"Google Web Ukraine" = df_google %>% filter(country == "ukraine", rank_group <= 14, type == "organic"),
"Google Web US" = df_google %>% filter(country == "united_states", rank_group <= 14, type == "organic"),
"Google Web UK" = df_google %>% filter(country == "united_kingdom", rank_group <= 14, type == "organic"),
.id = "dataset"
) %>%
filter(rsm_domain) %>%
tabyl(domain, country) %>%
as_tibble() %>%
arrange(-russia) %>%
transmute(domain = str_c('<a href="http://', domain, '" target="_blank">', domain, '</a>'),
russia_google = russia, russia_yandex, ukraine_google = ukraine, uk_google = united_kingdom, us_google = united_states) %>%
DT::datatable(escape = F, options = list(pageLength = 10))

Variation over time:
bind_rows(
"Yandex Web" = df_yandex,
"Google Web Russia" = df_google %>% filter(country == "russia", rank_group <= 14, type == "organic"),
"Google Web Ukraine" = df_google %>% filter(country == "ukraine", rank_group <= 14, type == "organic"),
"Google Web US" = df_google %>% filter(country == "united_states", rank_group <= 14, type == "organic"),
"Google Web UK" = df_google %>% filter(country == "united_kingdom", rank_group <= 14, type == "organic"),
.id = "dataset"
) %>%
group_by(date_clean, dataset) %>%
summarize(
prop_rsm = mean(rsm_domain, na.rm = T)
) %>%
filter(year(date_clean) > 2021) %>%
ggplot(aes(date_clean, prop_rsm)) +
geom_smooth(method = "loess", se = F, span = 0.1, aes(color = dataset)) +
geom_vline(xintercept = as.POSIXct(as.Date("2022-02-24")), linetype = "dashed", color = "red") +
scale_x_datetime(date_breaks = "1 month", date_labels = "%B") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = -45), legend.position = "bottom") +
labs(
x = "", y = "% Russian State Media Results on First Page", color = ""
)

The most popular RSM domains on the first page of news results, by country and search engine, are shown below. There are negligible results for the US/UK because RSM results are down-weighted there and hence appear on later pages.
bind_rows(
"Yandex Web" = df_yandex %>% mutate(country = "russia_yandex_web"),
"Google Web Russia" = df_google %>% filter(country == "russia", rank_group <= 14, type == "news_search"),
"Google Web Ukraine" = df_google %>% filter(country == "ukraine", rank_group <= 14, type == "news_search"),
"Google Web US" = df_google %>% filter(country == "united_states", rank_group <= 14, type == "news_search"),
"Google Web UK" = df_google %>% filter(country == "united_kingdom", rank_group <= 14, type == "news_search"),
.id = "dataset"
) %>%
filter(rsm_domain) %>%
tabyl(domain, country) %>%
as_tibble() %>%
arrange(-russia) %>%
transmute(domain = str_c('<a href="http://', domain, '" target="_blank">', domain, '</a>'),
russia_gnews = russia, russia_yandex_web, ukraine_gnews = ukraine, uk_gnews = united_kingdom, us_gnews = united_states) %>%
DT::datatable(escape = F, options = list(pageLength = 10))

Variation over time:
bind_rows(
"Yandex Web" = df_yandex,
"Google News Russia" = df_google %>% filter(country == "russia", rank_group <= 14, type == "news_search"),
"Google News Ukraine" = df_google %>% filter(country == "ukraine", rank_group <= 14, type == "news_search"),
"Google News US" = df_google %>% filter(country == "united_states", rank_group <= 14, type == "news_search"),
"Google News UK" = df_google %>% filter(country == "united_kingdom", rank_group <= 14, type == "news_search"),
.id = "dataset"
) %>%
group_by(date_clean, dataset) %>%
summarize(
prop_rsm = mean(rsm_domain, na.rm = T)
) %>%
filter(year(date_clean) > 2021) %>%
ggplot(aes(date_clean, prop_rsm)) +
geom_smooth(method = "loess", se = F, span = 0.1, aes(color = dataset)) +
geom_vline(xintercept = as.POSIXct(as.Date("2022-02-24")), linetype = "dashed", color = "red") +
scale_x_datetime(date_breaks = "1 month", date_labels = "%B") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = -45), legend.position = "bottom") +
labs(
x = "", y = "% Russian State Media Results on First Page", color = ""
)

Web search upweights RSM in Russia and Ukraine, and downweights RSM in the US and UK.
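A sketch of a more explicit version of this comparison: the within-country gap in mean organic-result rank between RSM and non-RSM domains (a negative gap means RSM results sit closer to the top of the page). This is an illustrative check, not part of the original pipeline.

df_google %>%
  filter(type == "organic") %>%
  drop_na(domain_clean) %>%
  group_by(country) %>%
  summarize(
    mean_rank_rsm   = mean(rank_group[rsm_domain], na.rm = TRUE),
    mean_rank_other = mean(rank_group[!rsm_domain], na.rm = TRUE),
    rank_gap        = mean_rank_rsm - mean_rank_other,
    .groups = "drop"
  )
# for standard errors, estimatr::lm_robust(rank_group ~ rsm_domain, data = <one country's rows>)
# gives the same gap as the rsm_domainTRUE coefficient (estimatr is loaded above)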
bind_rows(
"Google Web Russia" = df_google %>% filter(country == "russia", type == "organic"),
"Google Web Ukraine" = df_google %>% filter(country == "ukraine", type == "organic"),
"Google Web US" = df_google %>% filter(country == "united_states", type == "organic"),
"Google Web UK" = df_google %>% filter(country == "united_kingdom", type == "organic"),
.id = "dataset"
) %>%
drop_na(domain_clean) %>%
group_by(country, rsm_domain) %>%
summarize(mean_result_rank = mean(rank_group, na.rm = T)) %>%
mutate(country = str_replace_all(country, "_", " ") %>% str_to_title()) %>%
ggplot(aes(country, mean_result_rank)) +
geom_col(aes(fill = rsm_domain), position = "dodge", alpha = 0.85) +
scale_y_continuous(breaks = scales::breaks_width(10)) +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
labs(
x = "", y = "Mean Result Rank", fill = "RSM Domain"
)

No such relative weighting pattern is visible across countries in news search.
bind_rows(
"Google News Russia" = df_google %>% filter(country == "russia", type == "news_search"),
"Google News Ukraine" = df_google %>% filter(country == "ukraine", type == "news_search"),
"Google News US" = df_google %>% filter(country == "united_states", type == "news_search"),
"Google News UK" = df_google %>% filter(country == "united_kingdom", type == "news_search"),
.id = "dataset"
) %>%
drop_na(domain_clean) %>%
group_by(country, rsm_domain) %>%
summarize(mean_result_rank = mean(rank_group, na.rm = T)) %>%
mutate(country = str_replace_all(country, "_", " ") %>% str_to_title()) %>%
ggplot(aes(country, mean_result_rank)) +
geom_col(aes(fill = rsm_domain), position = "dodge", alpha = 0.85) +
scale_y_continuous(breaks = scales::breaks_width(10)) +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
labs(
x = "", y = "Mean Result Rank", fill = "RSM Domain"
)

What is RSM talking about? We start with the RSM results served in the United States.
data_vector <-
df_google %>%
filter(country == "united_states") %>%
filter(rsm_domain) %>%
transmute(title, description = str_remove_all(description, "N/A")) %>%
unite("text", c(title, description), na.rm = T, remove = T, sep = ". ") %>%
mutate(text = str_to_lower(text) %>% str_remove_all("russian|russia|ukrainian|ukraine")) %>%
pull(text)
# Create corpus
corpus <- Corpus(VectorSource(data_vector))
# Clean corpus
docs <-
corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create term-document matrix and raw word frequencies
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
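# (sketch) df_freetext holds the raw term frequencies computed above and is not used
# again below; a quick peek at the most frequent terms before any topic modelling
head(df_freetext, 10)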
minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We now inspect the number of topics using two metrics, CaoJuan2009 and Deveaud2014. A good number of topics has a low CaoJuan2009 value and a high Deveaud2014 value.
result <-
ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 2, to = 10, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
verbose = TRUE
)

## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Deveaud2014... done.
ldatuning::FindTopicsNumber_plot(result)

Based on the plots above, 3 seems to be a reasonable number of topics to start with, as it has a low CaoJuan2009 value and a high Deveaud2014 value.
K <- 3
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 3; V = 184; M = 305
## Sampling 1000 iterations!
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics
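# a minimal sketch of one way to summarize beta and theta: the five most probable terms
# per topic plus each topic's average share across documents (complements terms() below)
tibble(
  topic = seq_len(K),
  top_terms = apply(beta, 1, function(p) paste(names(sort(p, decreasing = TRUE))[1:5], collapse = ", ")),
  mean_share = colMeans(theta)
)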
# Top 10 terms per topic
# terms(topicModel, 10)

Word cloud for topic 1:
# visualize topics as word cloud
topicToViz <- 1 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 2:
# visualize topics as word cloud
topicToViz <- 2 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 3:
# visualize topics as word cloud
topicToViz <- 3 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

We repeat the same analysis for the RSM results served in the United Kingdom.

data_vector <-
df_google %>%
filter(country == "united_kingdom") %>%
filter(rsm_domain) %>%
transmute(title, description = str_remove_all(description, "N/A")) %>%
unite("text", c(title, description), na.rm = T, remove = T, sep = ". ") %>%
mutate(text = str_to_lower(text) %>% str_remove_all("russian|russia|ukrainian|ukraine")) %>%
pull(text)
# Create corpus
corpus <- Corpus(VectorSource(data_vector))
# Clean corpus
docs <-
corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create term-document matrix and raw word frequencies
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We again inspect the number of topics using the CaoJuan2009 and Deveaud2014 metrics; as before, a good number of topics has a low CaoJuan2009 value and a high Deveaud2014 value.
result <-
ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 2, to = 10, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
verbose = TRUE
)

## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Deveaud2014... done.
ldatuning::FindTopicsNumber_plot(result)

Based on the plots above, 4 seems to be a reasonable number of topics to start with, as it has a low CaoJuan2009 value and a high Deveaud2014 value.
K <- 4
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 4; V = 136; M = 260
## Sampling 1000 iterations!
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics
# Top 10 terms per topic
# terms(topicModel, 10)

Word cloud for topic 1:
# visualize topics as word cloud
topicToViz <- 1 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 2:
# visualize topics as word cloud
topicToViz <- 2 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 3:
# visualize topics as word cloud
topicToViz <- 3 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 4:
# visualize topics as word cloud
topicToViz <- 4 # change for your own topic of interest
# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# extract the probabilities of each of the 100 terms
probabilities <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
# visualize the terms as a word cloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)