# librerias
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
library(sp)
library(spData)
library(ggmap)
library(rgdal)
library(rgeos)
library(tmap)
# Sys.getenv()  # uncomment to inspect the current environment variables
# Set JAVA_HOME for this session before the remaining libraries are attached.
# NOTE(review): hard-coded, machine-specific Windows path; presumably required
# by a Java-backed dependency of the packages loaded next -- confirm and adapt
# it on every machine that runs this script.
Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre1.8.0_251")
library(tm)
library(qdap)
# Auxiliary helper functions of the project (loaded from scripts/)
source(here::here("scripts", "funciones_auxiliares.R"))
# Colour palette used by the plots below
# (presumably index-aligned with `comunidad_order` -- confirm)
plots_palette <- c("#ad5d51", "grey55", "#2b559e", "#947240")
# Display order of the communities
comunidad_order <- c("GOP", "Independent", "DNC", "Progressives")
Carga de datos. Estos datos han sido procesados previamente, sobre todo en lo que se refiere al texto de los tweets. Se excluyen los tweets que contienen términos relacionados con la pandemia que provoca la enfermedad COVID-19 (en el texto sin limpiar hay un gran número de términos para referirse a la enfermedad; se han unificado todos en el término COVID19).
Se ha recogido también información sobre los seguidores de las cuentas para definir cuatro comunidades: GOP, Independent, DNC y Progressives.
# Load the pre-processed Sanders tweets, drop every tweet whose text matches
# one of the pandemic-related patterns, put the community factor in the shared
# display order (`comunidad_order`) and store the user id as character.
sanders <- readRDS(
  here::here("datos_procesados", "formated_text_df_data", "bernie_no_sent.rds")
) %>%
  filter(
    str_detect(
      text,
      "#?covid19|#?COVID|#?[Pp]andemic|#trumpvirus",
      negate = TRUE
    )
  ) %>%
  mutate(
    comunidad = fct_relevel(comunidad, comunidad_order),
    user_id = as.character(user_id)
  )
library(tidyverse)
library(scales)
library(reldist)
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
library(sp)
library(spData)
library(ggmap)
library(rgdal)
library(rgeos)
library(tmap)
# Number of tweets per community and their share of the total (1 decimal)
count(sanders, comunidad) %>%
  mutate(
    f = percent(n / sum(n), accuracy = 0.1)
  )
## # A tibble: 4 x 3
## comunidad n f
## <fct> <int> <chr>
## 1 GOP 16768 9.0%
## 2 Independent 115869 61.9%
## 3 DNC 15314 8.2%
## 4 Progressives 39229 21.0%
# Tweet distribution table produced by the project helper
freq_tweets_usuario <- get_tweet_distribution(sanders)

# Share of tweets (`porc_tweets`) that falls into each 5%-wide bin of the
# cumulative share of users, plus its running total within each community.
# Only `por_usuarios_acum` feeds the binning; the other cumulative columns are
# computed but dropped by summarise().
freq_tweets_usuario %>%
  arrange(desc(tweets)) %>%
  mutate(
    tweets_acum = cumsum(tweets),
    usuarios_acum = cumsum(usuarios),
    por_tweets_acum = tweets_acum / sum(tweets),
    por_usuarios_acum = usuarios_acum / sum(usuarios),
    int_users_acum = cut(por_usuarios_acum, breaks = seq(0, 1, by = 0.05))
  ) %>%
  group_by(comunidad, int_users_acum) %>%
  summarise(porc_tweets = sum(porc_tweets)) %>%
  mutate(porc_tweets_acum = cumsum(porc_tweets))
## # A tibble: 26 x 4
## # Groups: comunidad [4]
## comunidad int_users_acum porc_tweets porc_tweets_acum
## <fct> <fct> <dbl> <dbl>
## 1 GOP (0,0.05] 0.994 0.994
## 2 GOP (0.05,0.1] 0.00367 0.998
## 3 GOP (0.1,0.15] 0.00122 0.999
## 4 GOP (0.25,0.3] 0.000815 1.00
## 5 GOP (0.95,1] 0.000407 1.00
## 6 Independent (0,0.05] 0.997 0.997
## 7 Independent (0.05,0.1] 0.00173 0.999
## 8 Independent (0.1,0.15] 0.000384 0.999
## 9 Independent (0.15,0.2] 0.000288 1.00
## 10 Independent (0.25,0.3] 0.000192 1.00
## # ... with 16 more rows
# Lorenz curve: cumulative share of tweets against cumulative share of users,
# one line per community
ggplot(
  freq_tweets_usuario,
  aes(x = porc_usuarios_acum, y = porc_tweets_acum, color = comunidad)
) +
  geom_line(size = 1.3) +
  scale_color_manual(values = plots_palette)
# Gini coefficient of tweets per community (weighted by `porc_usuarios`),
# sorted from the highest to the lowest value
gini_comunidades <- arrange(
  summarise(
    group_by(freq_tweets_usuario, comunidad),
    gini = gini(x = tweets, weights = porc_usuarios)
  ),
  desc(gini)
)
# TF-IDF of the tokens that start with "@" for every community except
# "Independent".
# NOTE(review): `sanders_unigram` is not created in the visible part of this
# script; presumably it is the output of get_unigrams() -- confirm.
sanders_tfidf_mentions <- sanders_unigram %>%
  filter(str_detect(word, "^@"), comunidad != "Independent") %>%
  get_frequencies() %>%
  get_tfidf()

# One facet per community; the fill legend is suppressed.
# guides(fill = "none") replaces the deprecated guides(fill = F), which also
# relied on the reassignable shorthand `F`.
plot_tfidf(sanders_tfidf_mentions, cols = 3, colores = c(1, 3, 4)) +
  labs(title = "Tópicos característicos de cada comunidad", y = "TF-IDF") +
  guides(fill = "none") +
  theme(
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 16),
    legend.title = element_blank(),
    strip.text = element_text(size = 15)
  )
# Data cleaning: drop the tweets that mention Joe Biden or Donald Trump
# (optionally as hashtags), as the pattern below specifies.
sanders_for_sent <- sanders %>%
  filter(
    str_detect(text, "#?([Jj]oe|[Bb]iden|[Dd]onald|[Tt]rump)", negate = TRUE)
  )

# Sentiment scoring. sentiment() returns one row per *sentence*, so a tweet
# with several sentences would make the score vector longer than the data
# frame. sentiment_by() returns exactly one averaged row per input element.
# The calls are namespaced because syuzhet, attached after sentimentr, also
# exports get_sentences().
sent_sanders <- sentimentr::sentiment_by(
  sentimentr::get_sentences(sanders_for_sent$text)
)
sanders_for_sent <- sanders_for_sent %>%
  mutate(ave_sentiment = sent_sanders$ave_sentiment)

# Mean sentiment per community
sanders_for_sent %>%
  group_by(comunidad) %>%
  summarise(sent = mean(ave_sentiment))
## # A tibble: 3 x 2
## comunidad sent
## <fct> <dbl>
## 1 GOP -0.0185
## 2 DNC 0.00529
## 3 Progressives 0.0227
# Distribution of the tweet-level sentiment per community.
# NOTE(review): only three palette colours are supplied, so this assumes
# `comunidad` holds three communities at this point -- confirm.
ggplot(
  sanders_for_sent,
  aes(x = comunidad, y = ave_sentiment, fill = comunidad)
) +
  geom_boxplot() +
  scale_fill_manual(values = plots_palette[c(1, 3, 4)])
# DNC vs Progressives: compare the sentiment of both communities with the
# project helper get_test_dif().
test_sent_dnc_prg <- get_test_dif(sanders_for_sent, group = c("DNC", "Progressives"))
# Print the result without its first element (presumably the `data` element
# used further below -- confirm); the printed output shows `IC` and `resultado`.
test_sent_dnc_prg[-1]
## $IC
## 2.5% 97.5%
## DNC 0.0002348595 0.01038558
## Progressives 0.0195764564 0.02586789
##
## $resultado
## [1] "Diferencia significativa en medias"
# DNC vs GOP: compare the sentiment of both communities with the project
# helper get_test_dif().
test_sent_dnc_gop <- get_test_dif(sanders_for_sent, group = c("DNC", "GOP"))
# Print the result without its first element (presumably the `data` element
# used further below -- confirm); the printed output shows `IC` and `resultado`.
test_sent_dnc_gop[-1]
## $IC
## 2.5% 97.5%
## DNC 0.0002132103 0.01028844
## GOP -0.0228184211 -0.01425693
##
## $resultado
## [1] "Diferencia significativa en medias"
# `data` elements of both comparisons stacked into one table.
# NOTE(review): not used again in the visible part of this script.
sent_test_data <- bind_rows(test_sent_dnc_prg$data, test_sent_dnc_gop$data)

# Histogram of `sentimiento` for the DNC vs Progressives comparison.
# `binwidth` was misspelled as `sanwidth`, which geom_histogram() ignores as an
# unknown parameter, so the intended bin width was never applied.
test_sent_dnc_prg$data %>%
  ggplot(aes(x = sentimiento)) +
  geom_histogram(aes(fill = comunidad), color = "white", binwidth = 0.0005) +
  scale_fill_manual(values = plots_palette[c(3, 4)])
# Geography: apply the project helper get_state_code() to every
# `place_full_name` and append the resulting columns to the tweets
sanders_for_sent <- bind_cols(
  sanders_for_sent,
  map_df(sanders_for_sent$place_full_name, get_state_code)
)
# Mean sentiment and row count per community and state for the DNC and
# Progressives communities. Only rows whose `State` contains two consecutive
# capital letters are used, and only community/state cells with more than 3
# rows are kept.
# The result is explicitly ungrouped: summarise() would otherwise leave it
# grouped by `comunidad`, the very column that is pivoted into names later.
sanders_sent_states <- sanders_for_sent %>%
  filter(
    comunidad %in% c("DNC", "Progressives"),
    str_detect(State, "[A-Z]{2}")
  ) %>%
  group_by(comunidad, State) %>%
  summarise(
    sent = mean(ave_sentiment),
    n = n()
  ) %>%
  ungroup() %>%
  filter(n > 3)
# Translate the two-letter state codes into full state names using the
# built-in `state.abb` / `state.name` vectors. match() yields NA for codes
# without an entry (e.g. "DC"), and it also handles an empty table, where the
# former `1:length(...)` loop iterated over c(1, 0).
state_codes <- sanders_sent_states$State
state_names <- state.name[match(state_codes, state.abb)]

# Report every code that could not be translated, one per line
for (unmatched_code in state_codes[is.na(state_names)]) {
  print(unmatched_code)
}
## [1] "DC"
## [1] "DC"
# Attach the full state names, discard the count column and the states without
# a name, and spread the communities into columns so that the DNC minus
# Progressives sentiment difference can be computed per state.
sanders_sent_states$state_name <- state_names
sanders_sent_states <- sanders_sent_states %>%
  select(-n) %>%
  filter(!is.na(state_name)) %>%
  pivot_wider(
    names_from = comunidad,
    values_from = sent
  ) %>%
  mutate(dif_sent = DNC - Progressives)
# Load the `us_states` dataset and join the state-level sentiment onto it by
# state name
data(us_states)
us_states <- left_join(
  us_states,
  sanders_sent_states,
  by = c("NAME" = "state_name")
)

# Map filled by the DNC minus Progressives sentiment difference, labelled with
# the state code
tm_shape(us_states) +
  tm_borders(lwd = 0.5, col = "grey60") +
  tm_fill(col = "dif_sent") +
  tm_text(text = "State", col = "grey55", fontfamily = "Georgia") +
  tm_layout(
    title = "Diferencia de sentimiento entre Demócratas y Progresistas"
  )
library(scales)
# NRC emotion profile per community (Independent excluded): count the words
# tagged with each NRC emotion, leaving out the positive/negative polarity
# labels, rescale the counts within each community with scales::rescale(),
# and spread the emotions into one column each, as ggradar expects.
nrc_sanders <- sanders_for_sent %>%
  get_unigrams() %>%
  filter(comunidad != "Independent") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(comunidad, sentiment, sort = TRUE) %>%
  mutate(sentiment = str_to_title(sentiment)) %>%
  ungroup() %>%
  group_by(comunidad) %>%
  mutate(n = scales::rescale(n)) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  select(
    comunidad, Joy, Trust, Fear, Surprise,
    Sadness, Disgust, Anger, Anticipation
  )
# Fear predominates.
# The chart looks more alike between Progressives and Republicans than between
# Democrats and Republicans.
# NOTE(review): the plot title reads 'sanders Change', which looks like a
# find/replace leftover -- confirm the intended topic name.
ggradar(
  mutate(ungroup(nrc_sanders), comunidad = fct_drop(comunidad)),
  plot.title = "Sentimientos expresados hacia el tópico 'sanders Change'",
  group.colours = plots_palette[c(1, 3, 4)],
  group.point.size = 4,
  legend.position = "top",
  legend.text.size = 11
) +
  theme(
    plot.title = element_text(size = 12, family = "Georgia", color = "grey55"),
    axis.title = element_text(family = "Georgia")
  )
Hay un nivel significativo de polaridad entre la comunidad demócrata y la progresista hacia el senador Bernie Sanders.
# Load the stored polarization test result for Sanders and print it without
# its first element
pol_test_dif <- readRDS(
  here::here(
    "datos_procesados",
    "formated_polarization_df_data",
    "polarization_testing",
    "sanders_polarization.rds"
  )
)
pol_test_dif[-1]
## $IC
## 2.5% 97.5%
## DNC-GOP 0.05765027 0.1169399
## DNC-PRG 0.15299160 0.1689503
##
## $resultado
## [1] "Diferencia significativa en medias"
# The polarization level is minimal in both cases, yet significant differences
# are found.
# Overlaid, semi-transparent histograms of `sentimiento`, one per `comunidad`
ggplot(
  pol_test_dif$data,
  aes(x = sentimiento, group = comunidad, fill = comunidad)
) +
  geom_histogram(
    binwidth = 0.002,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_manual(values = plots_palette[c(1, 4)]) +
  labs(
    title = "Replicaciones bootstrap de la media del scoring de polarización para los nodos frontera",
    x = "Polarization Score"
  )