# This appendix provides supplementary information for the thesis on [Topic].
# TODO: insert a plain-English explanation of the data management procedure.
# ============================================================
# Loading Libraries
# ============================================================
library(dplyr)
library(stringr)
library(tidyr)
library(tibble)
library(readxl)
library(ggplot2)
library(scales)
library(tidytext)
library(quanteda)
library(patchwork)
# ============================================================
# Data Preparation and Management
# ============================================================
# Locate the per-conference Excel exports in the working directory.
# The pattern is anchored at the end so that names which merely contain
# ".xlsx" (for example "*.xlsx.bak") are skipped. Excel's "~$" lock files,
# which sit next to a workbook while it is open and cannot be read by
# read_excel(), are dropped explicitly.
cop28_files <- list.files(pattern = "COP_28_comment_nr_.*\\.xlsx$")
cop29_files <- list.files(pattern = "COP_29_comment_nr_.*\\.xlsx$")
cop28_files <- cop28_files[!startsWith(cop28_files, "~$")]
cop29_files <- cop29_files[!startsWith(cop29_files, "~$")]

# Fail early with a readable message (typically a wrong working directory)
# instead of a later "object 'comment_content' not found" error.
if (length(cop28_files) == 0) {
  stop("No COP_28_comment_nr_*.xlsx files found in ", getwd(), call. = FALSE)
}
if (length(cop29_files) == 0) {
  stop("No COP_29_comment_nr_*.xlsx files found in ", getwd(), call. = FALSE)
}

# Read every workbook and stack the rows, tagging each row with its session.
cop28 <- bind_rows(lapply(cop28_files, read_excel)) %>%
  mutate(session = "COP28")
cop29 <- bind_rows(lapply(cop29_files, read_excel)) %>%
  mutate(session = "COP29")

# Keep only the session label and the comment text, then normalise the text.
# `conf` is a copy of `session`; the quanteda part of the script groups on it.
all_comments <- bind_rows(cop28, cop29) %>%
  transmute(
    session,
    conf = session,
    comment_content = as.character(comment_content)
  ) %>%
  mutate(
    comment_content = str_to_lower(comment_content),
    # Strip URLs before punctuation is removed, while they are still intact.
    comment_content = str_replace_all(comment_content, "http\\S+|www\\S+", " "),
    # Replace everything that is not a lower-case ASCII letter or whitespace.
    comment_content = str_replace_all(comment_content, "[^a-z\\s]", " "),
    comment_content = str_squish(comment_content)
  ) %>%
  # Drop comments that are missing or empty after cleaning.
  filter(!is.na(comment_content), nchar(comment_content) > 0)
# Corpus-specific noise terms removed in addition to the tidytext stop words.
# NOTE(review): "aid" is listed here and is also a keyword in the finance
# theme pattern used for `bigrams_themed`; confirm that this is intended.
custom_stop <- c(
  "comment", "deleted", "removed", "http", "https", "www",
  "reddit", "cop28", "cop29", "bla", "alex", "newman",
  "baker", "creek", "gates", "zayed", "sunak", "anderson",
  "sultan", "charles", "aliyev", "kerry", "starmer", "gore",
  "pannier", "soylent", "sincerely", "gotta", "rent", "kool",
  "aid", "baku", "karabakh", "azerbaijani", "archived", "version",
  "charts", "ourworldindata", "worldometers", "freedomhouse", "na",
  "thrown", "dioxide", "prematurely", "billion"
)

# Standard stop words followed by the custom terms, tagged lexicon = "custom".
all_stopwords <- stop_words %>%
  bind_rows(tibble(word = custom_stop, lexicon = "custom"))
# Tokenise every comment into word bigrams, discard any bigram in which either
# word is a stop word or contains a digit, and fold a handful of plural
# phrases into their singular form so both spellings are counted together.
bigrams_sep <- all_comments %>%
  unnest_tokens(bigram, comment_content, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% all_stopwords$word | word2 %in% all_stopwords$word),
    str_detect(word1, "\\d", negate = TRUE),
    str_detect(word2, "\\d", negate = TRUE)
  ) %>%
  unite("bigram", word1, word2, sep = " ") %>%
  mutate(
    # Unlisted bigrams pass through unchanged.
    bigram = recode(
      bigram,
      "fossil fuels" = "fossil fuel",
      "greenhouse gases" = "greenhouse gas",
      "developing nations" = "developing nation",
      "developing countries" = "developing country",
      "carbon emissions" = "carbon emission",
      "gas emissions" = "gas emission",
      "poor nations" = "poor nation"
    )
  )
# Assign each bigram to the first policy theme whose keyword pattern it
# contains (order: finance, justice, energy, adaptation, markets) and drop
# bigrams that match none of the patterns.
# NOTE(review): matching is by substring, so for example "coal" also matches
# "coalition" and "oil" also matches "soil"; confirm this is acceptable.
bigrams_themed <- bigrams_sep %>%
  mutate(
    theme = case_when(
      str_detect(bigram, "finance|financing|investment|fund|funding|ncqg|donor|aid|grant|accelerator|guilt") ~ "finance",
      str_detect(bigram, "equity|justice|developing|poor nation|global south|vulnerable|responsibility") ~ "justice",
      str_detect(bigram, "renewable|fossil|fuel|oil|gas|coal|solar|wind|nuclear|electricity|greenhouse") ~ "energy",
      str_detect(bigram, "adaptation|resilience|flood|drought|disaster|sea level|heat wave|damage|vulnerability") ~ "adaptation",
      str_detect(bigram, "carbon|market|pricing|technology|ccs|emission|neutral") ~ "markets"
      # No fallback branch: unmatched bigrams receive NA and are removed below.
    )
  ) %>%
  drop_na(theme)
# Figure 1 data: the ten most frequent themed bigrams per session.
# Ties are broken by row order so that each session keeps exactly ten rows.
top_bigrams_plot <- bigrams_themed %>%
  count(session, bigram, sort = TRUE) %>%
  group_by(session) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # Title-case the label, then order it by frequency within each facet.
  mutate(
    bigram = reorder_within(str_to_title(bigram), by = n, within = session)
  )
# Relative frequency of every (unthemed) bigram, expressed per 10,000 bigrams
# of its own session; this feeds the diverging-shift figure.
bigram_freq <- bigrams_sep %>%
  count(session, bigram) %>%
  mutate(
    total = sum(n),
    freq_per10k = n / total * 10000,
    .by = session
  )
# quanteda pipeline: build a corpus from the cleaned comments, tokenise,
# remove stop words and URL/wiki residue, and keep tokens of 3+ characters.
corp <- corpus(all_comments, text_field = "comment_content")

toks <- tokens(
  corp,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("en"))
toks <- tokens_remove(
  toks,
  pattern = c("https", "http", "www", "wikipedia", "wiki", "en")
)
toks <- tokens_keep(toks, min_nchar = 3)

# Bigrams are formed after token removal, so they join the remaining
# neighbouring tokens.
toks_bg <- tokens_ngrams(toks, n = 2)
dfm_bg <- dfm(toks_bg)

# Collapse the per-comment matrix to one row per conference. The grouping
# vector comes from the same data frame the corpus was built from.
dfm_g <- dfm_group(dfm_bg, groups = all_comments$conf)
# Raw bigram counts per conference in long form (used for the cross-corpus
# appearance scores). quanteda joins n-gram parts with "_"; restore spaces.
counts_long <- dfm_g %>%
  convert(to = "data.frame") %>%
  rename(conf = doc_id) %>%
  pivot_longer(cols = -conf, names_to = "bigram", values_to = "count") %>%
  mutate(bigram = str_replace_all(bigram, "_", " "))

# Total number of bigram occurrences per conference.
totals <- counts_long %>%
  group_by(conf) %>%
  summarise(total_bigrams = sum(count, na.rm = TRUE), .groups = "drop")

# TF-IDF over the grouped matrix, reshaped to long form. Only positive scores
# and features that contain a space (i.e. real bigrams) are kept.
tfidf <- dfm_tfidf(dfm_g)
tfidf_long <- tfidf %>%
  convert(to = "data.frame") %>%
  rename(conf = doc_id) %>%
  pivot_longer(cols = -conf, names_to = "bigram", values_to = "tf_idf") %>%
  mutate(bigram = str_replace_all(bigram, "_", " ")) %>%
  filter(tf_idf > 0, str_detect(bigram, " "))
# Keyword dictionary: one character vector of keywords per policy theme.
themes <- list(
  finance    = c("finance", "financing", "investment", "ncqg", "fund", "funding"),
  justice    = c("equity", "justice", "climate justice", "cbdrrc", "responsibility"),
  energy     = c("energy", "renewable", "fossil", "fuel", "oil", "gas", "coal"),
  adaptation = c("adaptation", "resilience", "loss", "damage", "vulnerability"),
  markets    = c("carbon", "market", "pricing", "article", "technology", "ccs")
)

# Long lookup table with one (theme, keyword) row per dictionary entry.
theme_lookup <- tibble(
  theme = rep(names(themes), times = lengths(themes)),
  keyword = str_to_lower(unlist(themes))
)

# Order of the list above; used later as the priority order between themes.
theme_order <- names(themes)
# Tag policy-relevant bigrams: pair every TF-IDF row with every keyword, keep
# the pairs where the bigram contains the keyword as a literal substring, and
# reduce to one theme per (conference, bigram) using the `theme_order`
# priority when several themes match.
policy_bigrams_one <- tfidf_long %>%
  crossing(theme_lookup) %>%
  filter(str_detect(bigram, fixed(keyword))) %>%
  distinct(conf, bigram, tf_idf, theme) %>%
  mutate(theme = factor(theme, levels = theme_order)) %>%
  arrange(conf, bigram, theme) %>%
  group_by(conf, bigram) %>%
  slice_head(n = 1) %>%
  ungroup()

# The ten highest-scoring policy bigrams per conference, labelled in
# sentence case for plotting.
top_policy_phrases_top10 <- policy_bigrams_one %>%
  group_by(conf) %>%
  slice_max(order_by = tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(bigram = str_to_sentence(bigram))
#' Score a set of bigrams in a source and a target conference
#'
#' Looks up each bigram's raw count and TF-IDF score in both conferences,
#' converts the counts to a rate per 10,000 bigrams, and computes how much of
#' the source rate carries over to the target.
#'
#' @param terms Character vector of bigrams to score.
#' @param source_conf Conference label (value of `conf`) the terms come from.
#' @param target_conf Conference label in which the terms are looked up.
#' @param counts_long Data frame with columns `conf`, `bigram`, `count`.
#' @param tfidf_long Data frame with columns `conf`, `bigram`, `tf_idf`.
#' @param totals Data frame with columns `conf`, `total_bigrams`.
#' @return A tibble with one row per term and the columns `bigram`,
#'   `src_count`, `tgt_count`, `src_tfidf`, `tgt_tfidf`, `src_per10k`,
#'   `tgt_per10k` and `carryover_ratio`, sorted by descending target rate.
#'   Missing counts are reported as 0 and missing TF-IDF scores stay `NA`.
#'   `carryover_ratio` is `NA` when the source rate is 0.
score_terms_across <- function(terms, source_conf, target_conf,
                               counts_long, tfidf_long, totals) {
  src_total <- totals$total_bigrams[totals$conf == source_conf]
  tgt_total <- totals$total_bigrams[totals$conf == target_conf]

  # An unknown label would give a zero-length total and an opaque size error
  # inside mutate(); report the actual problem instead.
  if (length(src_total) != 1) {
    stop(
      "`totals` must contain exactly one row for source_conf = '",
      source_conf, "'.",
      call. = FALSE
    )
  }
  if (length(tgt_total) != 1) {
    stop(
      "`totals` must contain exactly one row for target_conf = '",
      target_conf, "'.",
      call. = FALSE
    )
  }

  tibble(bigram = terms) %>%
    left_join(
      counts_long %>%
        filter(conf == source_conf) %>%
        select(bigram, src_count = count),
      by = "bigram"
    ) %>%
    left_join(
      counts_long %>%
        filter(conf == target_conf) %>%
        select(bigram, tgt_count = count),
      by = "bigram"
    ) %>%
    left_join(
      tfidf_long %>%
        filter(conf == source_conf) %>%
        select(bigram, src_tfidf = tf_idf),
      by = "bigram"
    ) %>%
    left_join(
      tfidf_long %>%
        filter(conf == target_conf) %>%
        select(bigram, tgt_tfidf = tf_idf),
      by = "bigram"
    ) %>%
    mutate(
      # A term absent from a conference has a count of zero, not a missing one.
      src_count = replace_na(src_count, 0),
      tgt_count = replace_na(tgt_count, 0),
      src_per10k = (src_count / src_total) * 10000,
      tgt_per10k = (tgt_count / tgt_total) * 10000,
      # Avoid dividing by a zero source rate.
      carryover_ratio = ifelse(
        src_per10k == 0, NA_real_, tgt_per10k / src_per10k
      )
    ) %>%
    arrange(desc(tgt_per10k), desc(tgt_count))
}
# Hand-picked bigrams for each conference. They are looked up in
# `counts_long`, which holds the quanteda bigrams without any plural folding.
cop28_terms <- c(
  "renewable share", "natural gas",
  "coal power", "coal production", "coal electricity", "coal based",
  "buying oil", "blocking progress", "baffling oil", "authoritarian oil"
)
cop29_terms <- c(
  "developing nation", "developing country",
  "russian gas", "oil drilling", "expanding coal",
  "developing world", "burn coal", "actual action",
  "accelerator fund", "greenhouse gas", "gas emissions"
)

# How often do the COP28 phrases appear in the COP29 corpus?
cop28_in_cop29 <- score_terms_across(
  cop28_terms, "COP28", "COP29",
  counts_long, tfidf_long, totals
)

# And the reverse: COP29 phrases looked up in the COP28 corpus.
cop29_in_cop28 <- score_terms_across(
  cop29_terms, "COP29", "COP28",
  counts_long, tfidf_long, totals
)
# Reshape the cross-corpus scores to long form for the dodged bar charts:
# one row per (bigram, corpus) with a readable legend label in `COP`.
plot_28_shift <- cop28_in_cop29 %>%
  select(bigram, src_per10k, tgt_per10k) %>%
  pivot_longer(
    cols = c(src_per10k, tgt_per10k),
    names_to = "where",
    values_to = "per10k"
  ) %>%
  mutate(
    # `where` holds only the two pivoted column names.
    COP = if_else(
      where == "src_per10k",
      "COP28 (original corpus)",
      "COP29 (appearance of COP28 terms)"
    ),
    bigram = str_to_sentence(bigram)
  )

plot_29_shift <- cop29_in_cop28 %>%
  select(bigram, src_per10k, tgt_per10k) %>%
  pivot_longer(
    cols = c(src_per10k, tgt_per10k),
    names_to = "where",
    values_to = "per10k"
  ) %>%
  mutate(
    COP = if_else(
      where == "src_per10k",
      "COP29 (original corpus)",
      "COP28 (appearance of COP29 terms)"
    ),
    bigram = str_to_sentence(bigram)
  )
# Bigrams shown in the diverging-shift figure. They are matched against
# `bigram_freq`, which is built from `bigrams_sep`, where selected plural
# phrases have been folded into the singular. The entry therefore has to be
# "gas emission": the plural "gas emissions" no longer exists in that table
# and would silently drop the phrase from the figure.
relevant_bigrams <- c(
  "renewable share", "natural gas", "coal power",
  "coal production", "coal electricity", "coal based",
  "buying oil", "blocking progress",
  "developing nation", "developing country", "developing world",
  "greenhouse gas", "gas emission", "russian gas",
  "oil drilling", "burn coal", "expanding coal",
  "accelerator fund"
)

# One row per bigram with its per-10k rate in each session (0 when absent),
# the COP29 - COP28 difference, and a label for the direction of the change.
diverging_data <- bigram_freq %>%
  filter(bigram %in% relevant_bigrams) %>%
  select(session, bigram, freq_per10k) %>%
  pivot_wider(
    names_from = session,
    values_from = freq_per10k,
    values_fill = 0
  ) %>%
  mutate(
    change = COP29 - COP28,
    # A change of exactly zero falls into the COP28 label.
    direction = if_else(
      change > 0,
      "More prominent in COP29",
      "More prominent in COP28"
    ),
    bigram = str_to_sentence(bigram)
  )
# Comment-level polarity with the Bing lexicon: count positive and negative
# words per comment and label the comment by whichever count is larger.
comment_sentiment <- all_comments %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, comment_content) %>%
  # Many tokens share a word, and a lexicon word may carry more than one
  # sentiment, so the join is declared many-to-many, as for the NRC join in
  # this script. This avoids an unexpected-relationship warning.
  left_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  group_by(session, id) %>%
  summarise(
    # Words missing from the lexicon have NA sentiment and count as neither.
    pos = sum(sentiment == "positive", na.rm = TRUE),
    neg = sum(sentiment == "negative", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(label = case_when(
    pos > neg ~ "positive",
    neg > pos ~ "negative",
    TRUE ~ "neutral"
  ))

# Share of comments per polarity label within each session, with the labels
# as an ordered factor (Negative, Neutral, Positive) for plotting.
sentiment_all <- comment_sentiment %>%
  count(session, label) %>%
  group_by(session) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  rename(sentiment = label) %>%
  mutate(
    sentiment = str_to_sentence(sentiment),
    sentiment = factor(sentiment, levels = c("Negative", "Neutral", "Positive"))
  )
# NRC lexicon restricted to its emotion categories: the two polarity
# categories ("positive", "negative") are removed.
nrc_filtered <- filter(
  get_sentiments("nrc"),
  !(sentiment %in% c("positive", "negative"))
)

# Share of each emotion among all emotion-word matches of a session.
# A word may map to several emotions, hence the many-to-many join.
nrc_scores <- all_comments %>%
  unnest_tokens(word, comment_content) %>%
  inner_join(nrc_filtered, by = "word", relationship = "many-to-many") %>%
  count(session, sentiment) %>%
  mutate(proportion = n / sum(n), .by = session) %>%
  mutate(sentiment = str_to_sentence(sentiment))
# Figure: top themed bigrams per session, one facet per conference.
twophrases <- ggplot(
  data = top_bigrams_plot,
  mapping = aes(x = n, y = bigram, fill = session)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(session), scales = "free_y") +
  # Strips the suffix that reorder_within() appended to the labels.
  scale_y_reordered() +
  scale_fill_manual(values = c("COP28" = "#298c8c", "COP29" = "#9fc8c8")) +
  labs(y = "Two Word Policy Phrases", x = "Frequency") +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    strip.text = element_text(size = 12),
    panel.spacing.x = unit(1.5, "lines"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
twophrases
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/twowordpolicyphrases.png",
  plot = twophrases,
  width = 9, height = 6, dpi = 300
)
# Fill colours per conference for the TF-IDF figure.
# NOTE(review): this mapping is the reverse of the one used in the
# `twophrases`, `Sentiment` and `emotion` figures (COP28 = "#298c8c");
# confirm which assignment is intended.
cop_year_colors <- c("COP28" = "#9fc8c8", "COP29" = "#298c8c")

# Figure: the ten most distinctive policy bigrams per conference by TF-IDF.
distinct10 <- ggplot(
  data = top_policy_phrases_top10,
  mapping = aes(
    x = tf_idf,
    y = reorder_within(bigram, tf_idf, conf),
    fill = conf
  )
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(conf), scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = cop_year_colors) +
  labs(y = "Policy Concept", x = "TF-IDF (distinctiveness by conference)") +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    strip.text = element_text(size = 12),
    panel.spacing.x = unit(1.5, "lines"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
distinct10
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/distinct10.png",
  plot = distinct10,
  width = 10, height = 7, dpi = 300
)
# Figure: COP29 phrases, rate in the COP29 corpus versus rate in COP28.
# Bars are ordered by the larger of each phrase's two rates.
phrases29 <- ggplot(
  data = plot_29_shift,
  mapping = aes(
    x = per10k,
    y = reorder(bigram, per10k, FUN = max),
    fill = COP
  )
) +
  geom_col(width = 0.7, position = position_dodge(width = 0.75)) +
  scale_fill_manual(
    values = c(
      "COP29 (original corpus)" = "#298c8c",
      "COP28 (appearance of COP29 terms)" = "#9fc8c8"
    )
  ) +
  labs(y = "Distinctive Phrases", x = "Frequency (per 10k bigrams)") +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    strip.text = element_text(size = 12),
    panel.spacing.x = unit(1.5, "lines"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
phrases29
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/Phrases29in28.png",
  plot = phrases29,
  width = 9, height = 6, dpi = 300
)
# Figure: COP28 phrases, rate in the COP28 corpus versus rate in COP29.
# Bars are ordered by the larger of each phrase's two rates.
phrases28 <- ggplot(
  data = plot_28_shift,
  mapping = aes(
    x = per10k,
    y = reorder(bigram, per10k, FUN = max),
    fill = COP
  )
) +
  geom_col(width = 0.7, position = position_dodge(width = 0.75)) +
  scale_fill_manual(
    values = c(
      "COP28 (original corpus)" = "#298c8c",
      "COP29 (appearance of COP28 terms)" = "#9fc8c8"
    )
  ) +
  labs(y = "Distinctive Phrases", x = "Frequency (per 10k bigrams)") +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    strip.text = element_text(size = 12),
    panel.spacing.x = unit(1.5, "lines"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
phrases28
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/Phrases28in29.png",
  plot = phrases28,
  width = 9, height = 6, dpi = 300
)
# Largest absolute change, rounded up to the next even number.
shift_max <- ceiling(max(abs(diverging_data$change), na.rm = TRUE) / 2) * 2

# Axis limits: the default window of -30 to 50, widened when the data need
# it. With fixed limits, ggplot2 silently drops any bar that extends beyond
# them, so the limits must always cover the largest change.
shift_limits <- c(min(-30, -shift_max), max(50, shift_max))

# Figure: diverging bars of the COP29 - COP28 change in per-10k frequency.
shift <- ggplot(diverging_data,
                aes(x = change,
                    y = reorder(bigram, change),
                    fill = direction)) +
  geom_col(width = 0.7) +
  geom_vline(xintercept = 0, linewidth = 0.8, color = "grey30") +
  scale_fill_manual(values = c(
    "More prominent in COP29" = "#298c8c",
    "More prominent in COP28" = "#9fc8c8"
  )) +
  scale_x_continuous(
    limits = shift_limits,
    # A tick every 10 units; equals seq(-30, 50, by = 10) for the default
    # window.
    breaks = breaks_width(10),
    expand = c(0, 0)
  ) +
  labs(
    x = "Change in Frequency (per 10,000 bigrams)",
    y = "Bigram Phrase",
    fill = NULL
  ) +
  theme_classic(base_family = "Times New Roman", base_size = 12) +
  theme(
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.text = element_text(size = 12),
    axis.line.x = element_line(color = "grey30", linewidth = 0.5),
    axis.ticks.x = element_line(color = "grey30"),
    axis.ticks.length = unit(0.25, "cm"),
    panel.grid = element_blank(),
    plot.margin = margin(10, 20, 10, 10),
    legend.position = "bottom",
    legend.text = element_text(size = 12)
  )
shift
ggsave(shift,
       filename = "~/Desktop/Plots_thesis_20260424/shift.png",
       width = 9, height = 7, dpi = 300)
# Figure: share of comments per Bing polarity label, dodged by session.
# The axis shows percentages without a "%" suffix; the unit is given in the
# axis title instead.
Sentiment <- ggplot(
  data = sentiment_all,
  mapping = aes(x = sentiment, y = proportion, fill = session)
) +
  geom_col(width = 0.6, position = "dodge") +
  scale_fill_manual(values = c("COP28" = "#298c8c", "COP29" = "#9fc8c8")) +
  scale_y_continuous(
    limits = c(0, 0.55),
    labels = label_percent(suffix = ""),
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    fill = "COP",
    x = "Sentiment Category",
    y = "Proportion of Comments (Percent)"
  ) +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    aspect.ratio = 1.8,
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
Sentiment
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/Sentimentdistribution.png",
  plot = Sentiment,
  width = 8, height = 6, dpi = 300
)
# Figure: share of each NRC emotion among detected emotion words, dodged by
# session and ordered by the larger of the two session shares.
emotion <- ggplot(
  data = nrc_scores,
  mapping = aes(
    x = proportion,
    y = reorder(sentiment, proportion, FUN = max),
    fill = session
  )
) +
  geom_col(width = 0.75, position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("COP28" = "#298c8c", "COP29" = "#9fc8c8")) +
  scale_x_continuous(
    limits = c(0, 0.25),
    labels = label_percent(suffix = ""),
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    fill = "COP",
    y = "NRC Emotion",
    x = "Proportion of detected emotion words (Percent)"
  ) +
  theme_minimal(base_size = 12, base_family = "Times New Roman") +
  theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 13, margin = margin(t = 12)),
    axis.title.y = element_text(size = 13, margin = margin(r = 15)),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    panel.grid.major.x = element_line(color = "grey90"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  )
emotion
ggsave(
  filename = "~/Desktop/Plots_thesis_20260424/emotion.png",
  plot = emotion,
  width = 9, height = 6, dpi = 300
)