# librerias
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
# Sys.getenv()
Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre1.8.0_251")
library(tm)
library(qdap)
# Project helper functions (loaded from the scripts/ folder of the project)
source(here::here("scripts", "funciones_auxiliares.R"))

# Colour palette used by the plots below; it has four entries, matching the
# four communities listed in `comunidad_order`
plots_palette <- c(
  "#ad5d51",
  "grey55",
  "#2b559e",
  "#947240"
)

# Display order of the communities
comunidad_order <- c(
  "GOP",
  "Independent",
  "DNC",
  "Progressives"
)
Carga de datos. Estos datos han sido previamente procesados, sobre todo en lo que se refiere al texto de los tweets. Se excluyen los tweets que contienen términos relacionados con la pandemia que provoca la enfermedad COVID-19 (en el texto sin limpiar hay un número amplio de términos para referirse a la enfermedad; se han unificado todos en el término COVID19).
Se ha recogido también información sobre los seguidores de las cuentas para definir 4 comunidades.
# Load the pre-processed education data set, drop every row whose `text`
# matches one of the pandemic-related patterns, and normalise column types.
education <- readRDS(
  here::here("datos_procesados", "formated_text_df_data", "education.rds")
) %>%
  # negate = TRUE keeps only the rows that do NOT match the pattern.
  # TRUE is spelled out because `T` is a plain variable that can be reassigned.
  filter(
    str_detect(
      text,
      "#?covid19|#?COVID|#?[Pp]andemic|#trumpvirus",
      negate = TRUE
    )
  ) %>%
  mutate(
    # Reuse the shared level order defined next to the palette instead of
    # repeating the four community names here.
    comunidad = fct_relevel(comunidad, comunidad_order),
    user_id = as.character(user_id)
  )
# Rows of `education` per community, plus each community's share of the
# total formatted as a percentage with one decimal place.
education %>%
  count(comunidad) %>%
  mutate(
    f = percent(prop.table(n), accuracy = 0.1)
  )
## # A tibble: 4 x 3
## comunidad n f
## <fct> <int> <chr>
## 1 GOP 14375 4.0%
## 2 Independent 315897 89.0%
## 3 DNC 14928 4.2%
## 4 Progressives 9858 2.8%
# Tweet distribution table built by a project helper (presumably defined in
# the sourced funciones_auxiliares.R -- TODO confirm). The code below reads
# its columns `tweets`, `usuarios`, `porc_tweets` and `comunidad`.
freq_tweets_usuario <- get_tweet_distribution(education)

# Bucket the rows into 5%-wide intervals of the cumulative user share and
# report, per community and interval, the summed and running `porc_tweets`.
freq_tweets_usuario %>%
  arrange(desc(tweets)) %>%
  mutate(
    # Running totals follow the row order set by arrange() above.
    # NOTE(review): whether they run within each community or across all
    # rows depends on the grouping returned by the helper -- confirm.
    tweets_acum = cumsum(tweets),
    usuarios_acum = cumsum(usuarios),
    # NOTE(review): tweets_acum / por_tweets_acum are dropped by the
    # summarise() below and never read in this pipeline.
    por_tweets_acum = tweets_acum / sum(tweets),
    por_usuarios_acum = usuarios_acum / sum(usuarios),
    int_users_acum = cut(por_usuarios_acum, breaks = seq(0, 1, by = 0.05))
  ) %>%
  group_by(comunidad, int_users_acum) %>%
  summarise(porc_tweets = sum(porc_tweets)) %>%
  # The printed result is still grouped by `comunidad`, so this running sum
  # restarts for every community.
  mutate(porc_tweets_acum = cumsum(porc_tweets))
## # A tibble: 20 x 4
## # Groups: comunidad [4]
## comunidad int_users_acum porc_tweets porc_tweets_acum
## <fct> <fct> <dbl> <dbl>
## 1 GOP (0,0.05] 0.983 0.983
## 2 GOP (0.05,0.1] 0.00673 0.990
## 3 GOP (0.1,0.15] 0.00505 0.995
## 4 GOP (0.25,0.3] 0.00337 0.998
## 5 GOP (0.95,1] 0.00168 1
## 6 Independent (0,0.05] 0.999 0.999
## 7 Independent (0.05,0.1] 0.000246 1.00
## 8 Independent (0.1,0.15] 0.000184 1.00
## 9 Independent (0.2,0.25] 0.000123 1.00
## 10 Independent (0.95,1] 0.0000614 1
## 11 DNC (0,0.05] 0.987 0.987
## 12 DNC (0.05,0.1] 0.00512 0.992
## 13 DNC (0.1,0.15] 0.00384 0.996
## 14 DNC (0.25,0.3] 0.00256 0.999
## 15 DNC (0.95,1] 0.00128 1
## 16 Progressives (0,0.05] 0.993 0.993
## 17 Progressives (0.05,0.1] 0.00272 0.996
## 18 Progressives (0.1,0.15] 0.00204 0.998
## 19 Progressives (0.3,0.35] 0.00136 0.999
## 20 Progressives (0.95,1] 0.000681 1.00
# Lorenz curve: `porc_usuarios_acum` on the x axis against
# `porc_tweets_acum` on the y axis, one coloured line per community.
ggplot(
  freq_tweets_usuario,
  aes(x = porc_usuarios_acum, y = porc_tweets_acum)
) +
  geom_line(aes(color = comunidad), size = 1.3) +
  scale_color_manual(values = plots_palette)
# Gini index per community, computed on `tweets` with `porc_usuarios` as the
# weights, and listed from the highest index to the lowest.
gini_comunidades <- freq_tweets_usuario %>%
  group_by(comunidad) %>%
  summarise(
    gini = gini(x = tweets, weights = porc_usuarios)
  ) %>%
  arrange(desc(gini))
Sentimiento de confianza en todos los grupos, sin diferencias entre ellos.
# NRC emotion profile per community, reshaped to one row per community and
# one column per emotion.
# `education_unigram` is not created in the visible part of this script; it
# must already exist when this chunk runs.
nrc_education <- education_unigram %>%
  # The "Independent" community is left out of the comparison.
  filter(comunidad != "Independent") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # Keep the emotion categories only; drop the positive/negative polarity.
  filter(!sentiment %in% c("positive", "negative")) %>%
  # TRUE is spelled out because `T` is a plain variable that can be reassigned.
  count(comunidad, sentiment, sort = TRUE) %>%
  mutate(sentiment = str_to_title(sentiment)) %>%
  ungroup() %>%
  group_by(comunidad) %>%
  # Rescale the counts within each community.
  mutate(n = rescale(n)) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  # Fix the column order of the wide table.
  select(
    comunidad,
    Anger, Anticipation, Disgust, Fear,
    Joy, Sadness, Surprise, Trust
  )
# Fear predominates (original author's reading of the chart).
# Radar chart of the emotion profile, one polygon per community.
nrc_education %>%
  ungroup() %>%
  # Drop the factor levels that no longer occur in the data.
  mutate(comunidad = fct_drop(comunidad)) %>%
  ggradar(
    # Palette entries 1, 3 and 4; entry 2 is skipped.
    group.colours = plots_palette[c(1, 3, 4)],
    legend.position = "top",
    group.point.size = 4,
    plot.title = "Sentimientos expresados hacia el tópico 'education Change'",
    legend.text.size = 11
  ) +
  theme(
    plot.title = element_text(size = 12, family = "Georgia", color = "grey55"),
    axis.title = element_text(family = "Georgia")
  )
Hay un nivel de polarización significativo entre el Partido Demócrata y el movimiento progresista, pero no muy elevado.
library(boot)
# Load the stored polarization test results for the education topic.
pol_test_dif <- readRDS(
  here::here(
    "datos_procesados",
    "formated_polarization_df_data",
    "polarization_testing",
    "education_polarization.rds"
  )
)
# Print every element except the first one (presumably `data`, which is
# plotted further down -- TODO confirm the element order).
pol_test_dif[-1]
## $IC
## 2.5% 97.5%
## DNC-GOP -0.00308642 0.041666667
## DNC-PRG -0.02103996 0.004781421
##
## $resultado
## [1] "No se encuentran diferencias en medias"
# Minimal polarization level in both cases, but significant differences are
# found (original author's note).
# NOTE(review): the `$resultado` printed from `pol_test_dif` reports that no
# differences in means were found -- confirm which statement is correct.
# Overlaid histograms of the `sentimiento` values, one per community.
ggplot(pol_test_dif$data, aes(x = sentimiento)) +
  geom_histogram(
    aes(group = comunidad, fill = comunidad),
    color = "white",
    alpha = 0.5,
    binwidth = 0.003,
    # "identity" draws the histograms on top of each other, not stacked.
    position = "identity"
  ) +
  # Palette entries 1 and 4.
  scale_fill_manual(values = plots_palette[c(1, 4)]) +
  labs(
    title = "Replicaciones bootstrap del scoring de polarización",
    x = "Average polarization score"
  )