# librerias
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
library(sp)
library(spData)
library(ggmap)
library(rgdal)
library(rgeos)
library(tmap)
# Java runtime location (inspect the current value with Sys.getenv("JAVA_HOME")).
# NOTE(review): presumably needed by the Java-backed packages loaded right
# after this line — confirm.
# The path is machine-specific, so it is only applied where that directory
# actually exists; on any other machine an already configured JAVA_HOME is left
# untouched instead of being overwritten with a path that is not there.
java_home <- "C:\\Program Files\\Java\\jre1.8.0_251"
if (dir.exists(java_home)) {
  Sys.setenv(JAVA_HOME = java_home)
}
library(tm)
library(qdap)
# Project helper functions, loaded from scripts/funciones_auxiliares.R
source(here::here("scripts", "funciones_auxiliares.R"))

# Colour palette shared by the plots
# NOTE(review): the plots pair these colours with the community factor levels
# positionally, so the order here presumably mirrors `comunidad_order` — confirm.
plots_palette <- c(
  "#ad5d51",
  "grey55",
  "#2b559e",
  "#947240"
)

# Display order of the communities
comunidad_order <- c(
  "GOP",
  "Independent",
  "DNC",
  "Progressives"
)
Estos datos han sido procesados previamente, sobre todo en lo que se refiere al texto de los tweets. Se excluyen los tweets que contienen términos relacionados con la pandemia de COVID-19 (en el texto sin limpiar hay un amplio número de términos para referirse a la enfermedad; todos ellos se han unificado en el término COVID19).
También se ha recogido información sobre los seguidores de las cuentas para definir 4 comunidades.
# Load the two pre-processed parts of the Biden tweet data and merge them.
# Both files go through exactly the same steps, so the pipeline is written once
# and mapped over the file names instead of being copy-pasted per part.
biden <- c("biden_no_sent_part1.rds", "biden_no_sent_part2.rds") %>%
  map(function(rds_file) {
    # tweets whose text matches any of these pandemic-related terms are dropped
    pandemic_terms <- "#?covid19|#?COVID|#?[Pp]andemic|#trumpvirus"
    readRDS(here::here("datos_procesados", "formated_text_df_data", rds_file)) %>%
      filter(str_detect(text, pandemic_terms, negate = TRUE)) %>%
      mutate(
        # reuse the community order defined once at the top of the script
        comunidad = fct_relevel(comunidad, comunidad_order),
        user_id = as.character(user_id)
      )
  }) %>%
  bind_rows() %>%
  # keep a single row per status_id (the first one found), with all its columns
  distinct(status_id, .keep_all = TRUE)
# Trigger garbage collection now that the per-file tables are no longer referenced
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5160515 275.7 9658325 515.9 5177089 276.5
## Vcells 32200147 245.7 52265405 398.8 50463096 385.1
library(tidyverse)
library(scales)
library(reldist)
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
library(sp)
library(spData)
library(ggmap)
library(rgdal)
library(rgeos)
library(tmap)
# Number of tweets per community (`n`) and the share of the total each one
# represents (`f`, formatted as a percentage with one decimal place)
mutate(
  count(biden, comunidad),
  f = percent(n / sum(n), accuracy = 0.1)
)
## # A tibble: 4 x 3
## comunidad n f
## <fct> <int> <chr>
## 1 GOP 127833 18.9%
## 2 Independent 417877 61.8%
## 3 DNC 76078 11.2%
## 4 Progressives 54886 8.1%
# Tweet distribution table built by the project helper
freq_tweets_usuario <- get_tweet_distribution(biden)

# Share of tweets that falls in each 5%-wide band of cumulative users, per
# community, plus its running total within the community.
# NOTE(review): the cumulative sums are computed before group_by(), so unless
# get_tweet_distribution() returns a grouped table they accumulate across all
# communities at once — confirm this is intended.
freq_tweets_usuario %>%
  arrange(desc(tweets)) %>%
  mutate(
    tweets_acum = cumsum(tweets),
    usuarios_acum = cumsum(usuarios),
    por_tweets_acum = tweets_acum / sum(tweets),
    por_usuarios_acum = usuarios_acum / sum(usuarios),
    int_users_acum = cut(por_usuarios_acum, breaks = seq(0, 1, by = 0.05))
  ) %>%
  group_by(comunidad, int_users_acum) %>%
  summarise(porc_tweets = sum(porc_tweets)) %>%
  mutate(porc_tweets_acum = cumsum(porc_tweets))
## # A tibble: 33 x 4
## # Groups: comunidad [4]
## comunidad int_users_acum porc_tweets porc_tweets_acum
## <fct> <fct> <dbl> <dbl>
## 1 GOP (0,0.05] 0.985 0.985
## 2 GOP (0.05,0.1] 0.00913 0.994
## 3 GOP (0.1,0.15] 0.00249 0.997
## 4 GOP (0.15,0.2] 0.000996 0.998
## 5 GOP (0.2,0.25] 0.000830 0.998
## 6 GOP (0.25,0.3] 0.000664 0.999
## 7 GOP (0.35,0.4] 0.000498 1.00
## 8 GOP (0.55,0.6] 0.000332 1.00
## 9 GOP (0.95,1] 0.000166 1
## 10 Independent (0,0.05] 0.997 0.997
## # ... with 23 more rows
# Lorenz curve: cumulative share of tweets against cumulative share of users,
# one line per community, coloured with the shared palette
ggplot(
  freq_tweets_usuario,
  aes(x = porc_usuarios_acum, y = porc_tweets_acum)
) +
  geom_line(aes(color = comunidad), size = 1.3) +
  scale_color_manual(values = plots_palette)
# Gini coefficient of tweets per community (weighted by the share of users),
# sorted from the highest to the lowest coefficient
gini_comunidades <- arrange(
  summarise(
    group_by(freq_tweets_usuario, comunidad),
    gini = gini(x = tweets, weights = porc_usuarios)
  ),
  desc(gini)
)
# TF-IDF of the mentions (tokens starting with "@") for every community except
# "Independent". `biden_unigram` is not created in this part of the script.
biden_tfidf_mentions <- biden_unigram %>%
  filter(str_detect(word, "^@"), comunidad != "Independent") %>%
  get_frequencies() %>%
  get_tfidf()

# NOTE(review): `colores = c(1, 3, 4)` presumably indexes `plots_palette` for
# the three remaining communities — confirm against plot_tfidf().
plot_tfidf(biden_tfidf_mentions, cols = 3, colores = c(1, 3, 4)) +
  labs(title = "Menciones característicos de cada comunidad", y = "TF-IDF") +
  # hide the fill legend; "none" replaces the reassignable shorthand `F`
  guides(fill = "none") +
  theme(
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 16),
    legend.title = element_blank(),
    strip.text = element_text(size = 15)
  )
Hay una percepción significativamente positiva por parte de los demócratas hacia Joe Biden, mientras que los progresistas tienen una percepción significativamente negativa.
Hay también una divergencia geográfica en cuanto a la percepción del candidato.
# Data cleaning: drop every tweet that mentions Sanders or Trump
biden_for_sent <- biden %>%
  filter(str_detect(text, "#?([Bb]ernie|[Ss]anders|[Dd]onald|[Tt]rump)", negate = TRUE))

# Sentiment scoring.
# sentiment() works at sentence level: it returns one row per sentence, so its
# `sentiment` column only lines up with the tweets when every tweet is a single
# sentence. The sentences are therefore split explicitly and the scores are
# aggregated back to one value per tweet before being attached to the data.
sent_biden <- sentiment(get_sentences(biden_for_sent$text))
# one row per tweet, in the original order
sent_biden_tweet <- sentiment_by(sent_biden)
biden_for_sent <- biden_for_sent %>%
  mutate(ave_sentiment = sent_biden_tweet$ave_sentiment)
# Mean sentiment score of the tweets in each community
summarise(
  group_by(biden_for_sent, comunidad),
  sent = mean(ave_sentiment)
)
## # A tibble: 3 x 2
## comunidad sent
## <fct> <dbl>
## 1 GOP -0.0497
## 2 DNC 0.0307
## 3 Progressives -0.0229
# Box plot of the per-tweet sentiment score, one box per community
ggplot(biden_for_sent, aes(x = comunidad, y = ave_sentiment)) +
  geom_boxplot(aes(fill = comunidad)) +
  scale_fill_manual(values = plots_palette[c(1, 3, 4)])
# DNC vs Progressives: difference-in-means test on sentiment (project helper)
test_sent_dnc_prg <- get_test_dif(
  biden_for_sent,
  group = c("DNC", "Progressives")
)
# Print the result without its first element
test_sent_dnc_prg[-1]
## $IC
## 2.5% 97.5%
## DNC 0.02818961 0.03312151
## Progressives -0.02548160 -0.02029681
##
## $resultado
## [1] "Diferencia significativa en medias"
# DNC vs GOP: difference-in-means test on sentiment (project helper)
test_sent_dnc_gop <- get_test_dif(
  biden_for_sent,
  group = c("DNC", "GOP")
)
# Print the result without its first element
test_sent_dnc_gop[-1]
## $IC
## 2.5% 97.5%
## DNC 0.02820353 0.03314447
## GOP -0.05128617 -0.04816092
##
## $resultado
## [1] "Diferencia significativa en medias"
# Stack the `data` element of both tests into a single table
sent_test_data <- bind_rows(
  list(test_sent_dnc_prg$data, test_sent_dnc_gop$data)
)

# Histogram of the `sentimiento` values stored in the DNC vs Progressives test,
# filled by community
ggplot(test_sent_dnc_prg$data, aes(x = sentimiento)) +
  geom_histogram(
    aes(fill = comunidad),
    color = "white",
    binwidth = 0.0005
  ) +
  scale_fill_manual(values = plots_palette[c(3, 4)])
# Geography
# Append the columns returned by the project helper get_state_code() for each
# tweet's `place_full_name` (the code below relies on a `State` column)
biden_for_sent <- bind_cols(
  biden_for_sent,
  map_df(biden_for_sent$place_full_name, get_state_code)
)

# Mean sentiment and number of tweets per community and state, restricted to
# DNC / Progressives and to rows whose State contains two consecutive capital
# letters; only cells with more than 3 tweets are kept
biden_sent_states <- biden_for_sent %>%
  filter(
    comunidad %in% c("DNC", "Progressives"),
    str_detect(State, "[A-Z]{2}")
  ) %>%
  group_by(comunidad, State) %>%
  summarise(
    sent = mean(ave_sentiment),
    n = n()
  ) %>%
  filter(n > 3)
# Translate each two-letter state code into its full name using the built-in
# `state.abb` / `state.name` vectors. match() returns NA for codes that are not
# in `state.abb`, and it also copes with an empty table, which the previous
# `1:length(...)` loop did not.
state_names <- state.name[match(biden_sent_states$State, state.abb)]
# Report every code that could not be translated, one line per affected row
for (state_code in biden_sent_states$State[is.na(state_names)]) {
  print(state_code)
}
## [1] "DC"
## [1] "DC"
biden_sent_states$state_name <- state_names
# One row per state with the mean sentiment of each community side by side and
# their difference (DNC minus Progressives).
biden_sent_states <- biden_sent_states %>%
  # the table is still grouped by `comunidad` from the earlier summarise();
  # drop the grouping before pivoting on that very column
  ungroup() %>%
  # rows whose state code had no matching state name are discarded
  filter(!is.na(state_name)) %>%
  # `n` differs between communities and would stop rows from collapsing per state
  select(-n) %>%
  pivot_wider(names_from = comunidad, values_from = sent) %>%
  mutate(dif_sent = DNC - Progressives)
# Load the `us_states` dataset and attach the per-state sentiment figures,
# matching its NAME column against `state_name`
data(us_states)
us_states <- left_join(
  us_states,
  biden_sent_states,
  by = c("NAME" = "state_name")
)

# Map filled by the DNC - Progressives sentiment difference and labelled with
# the state code
tm_shape(us_states) +
  tm_borders(col = "grey60", lwd = 0.5) +
  tm_fill(col = "dif_sent") +
  tm_text(text = "State", fontfamily = "Georgia", col = "grey55") +
  tm_layout(
    title = "Diferencia de sentimiento entre Demócratas y Progresistas"
  )
Progresistas y republicanos están más alineados en su sentimiento que progresistas y demócratas.
library(scales)
# Emotion profile per community based on the NRC lexicon.
nrc_biden <- biden_for_sent %>%
  get_unigrams() %>%
  filter(comunidad != "Independent") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # keep the emotion categories only; polarity labels are excluded
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(comunidad, sentiment, sort = TRUE) %>%
  mutate(sentiment = str_to_title(sentiment)) %>%
  # group_by() replaces any previous grouping, so no separate ungroup() is needed
  group_by(comunidad) %>%
  # rescale the counts within each community
  mutate(n = scales::rescale(n)) %>%
  # one row per community, one column per emotion
  pivot_wider(names_from = sentiment, values_from = n) %>%
  select(comunidad, Joy, Trust, Fear, Surprise, Sadness, Disgust, Anger, Anticipation)
# The chart looks more alike for Progressives and Republicans than for
# Democrats and Republicans.
# Unused factor levels are dropped before the data reach ggradar().
nrc_biden %>%
  ungroup() %>%
  mutate(comunidad = fct_drop(comunidad)) %>%
  ggradar(
    group.colours = plots_palette[c(1, 3, 4)],
    legend.position = "top",
    group.point.size = 4,
    plot.title = "Sentimientos expresados hacia el tópico 'Joe Biden'",
    legend.text.size = 11
  ) +
  theme(
    plot.title = element_text(size = 12, family = "Georgia", color = "grey55"),
    axis.title = element_text(family = "Georgia")
  )
Hay una clara polarización entre demócratas y republicanos. Aunque también existe cierto nivel de polarización entre demócratas y progresistas, este es mucho menor.
# Read the stored polarization test results
pol_test_dif <- readRDS(
  here::here(
    "datos_procesados",
    "formated_polarization_df_data",
    "polarization_testing",
    "biden_polarization.rds"
  )
)
# Print the result without its first element
pol_test_dif[-1]
## $IC
## 2.5% 97.5%
## DNC-GOP 0.22911727 0.23889725
## DNC-PRG 0.01019636 0.01626453
##
## $resultado
## [1] "Diferencia significativa en medias"
# The polarization level is minimal in both cases, yet the differences found
# are significant.
# Overlaid, semi-transparent histograms of the `sentimiento` values stored in
# the test data, one per `comunidad` value
ggplot(pol_test_dif$data, aes(x = sentimiento)) +
  geom_histogram(
    aes(group = comunidad, fill = comunidad),
    color = "white",
    alpha = 0.5,
    binwidth = 0.003,
    position = "identity"
  ) +
  scale_fill_manual(values = plots_palette[c(1, 4)]) +
  labs(
    title = "Replicaciones bootstrap del scoring medio de polarización para los nodos frontera",
    x = "Average polarization score"
  ) +
  theme_bw()