Education - análisis exploratorio

# librerias
library(qdapRegex)
library(sentimentr)
library(syuzhet)
library(lubridate)
library(tidyverse)
library(tidytext)
library(forcats)
library(tokenizers)
library(widyr)
library(igraph)
library(ggraph)
library(topicmodels)
library(forcats)
library(scales)
library(stringr)
library(extrafont)
library(reldist)
library(grid)
library(gridExtra)
library(boot)
library(ggradar)
library(tidygraph)
library(boot)
# Java runtime location, presumably needed by the packages loaded right after
# this (TODO confirm which one requires it).
# The path is machine-specific, so it is only applied when the directory really
# exists; otherwise whatever JAVA_HOME the environment already provides is kept
# instead of being overwritten with a path that is not there.
# Inspect the current value with Sys.getenv("JAVA_HOME").
java_home <- "C:\\Program Files\\Java\\jre1.8.0_251"
if (dir.exists(java_home)) {
  Sys.setenv(JAVA_HOME = java_home)
}

library(tm)
library(qdap)

# Project helper functions
# (presumably where the get_* / create_* / plot_* helpers used below are
# defined -- TODO confirm).
source(
  here::here("scripts", "funciones_auxiliares.R")
)

# Colour palette: one colour per community, positionally matching
# `comunidad_order` below.
plots_palette <- c(
  "#ad5d51",
  "grey55",
  "#2b559e",
  "#947240"
)

# Display order of the communities
comunidad_order <- c(
  "GOP",
  "Independent",
  "DNC",
  "Progressives"
)

Distribución de tweets

Carga de datos. Estos datos han sido previamente procesados, sobre todo en lo que se refiere al texto de los tweets. Se excluyen términos relacionados con la pandemia que provoca la enfermedad COVID-19 (en el texto sin limpiar, hay un número amplio de términos para referirse a la enfermedad. Se han unificado todos en el término COVID19).

Se ha recogido también información sobre los seguidores de cuentas para definir 4 comunidades

GOP: Seguidores de la cuenta del partido republicano, que no siguen la cuenta del partido demócrata ni del movimiento progresista
DNC: Seguidores de la cuenta del partido demócrata, que no siguen la cuenta del partido republicano ni del movimiento progresista
Progressives: Seguidores de la cuenta del movimiento progresista, que no siguen la cuenta del partido republicano ni del partido demócrata
Independent: usuarios de tweets recogidos que no se encuentran incluidos en ninguno de los casos anteriores

# Load the pre-processed education tweets, drop every tweet whose text matches
# one of the pandemic-related patterns, and order the community factor with the
# shared `comunidad_order` vector (same levels that were previously repeated
# inline here).
education <- readRDS(
  here::here("datos_procesados", "formated_text_df_data", "education.rds")
) %>%
  filter(
    # negate = TRUE keeps only the rows that do NOT match the pattern
    str_detect(
      text,
      "#?covid19|#?COVID|#?[Pp]andemic|#trumpvirus",
      negate = TRUE
    )
  ) %>%
  mutate(
    comunidad = fct_relevel(comunidad, comunidad_order),
    # ids are handled as text from here on
    user_id = as.character(user_id)
  )

# Number of tweets per community and its share of the total, formatted as a
# percentage with one decimal.
education %>%
  count(comunidad) %>%
  mutate(
    f = percent(n / sum(n), accuracy = 0.1)
  )

## # A tibble: 4 x 3
##   comunidad         n f    
##   <fct>         <int> <chr>
## 1 GOP           14375 4.0% 
## 2 Independent  315897 89.0%
## 3 DNC           14928 4.2% 
## 4 Progressives   9858 2.8%

# Tweets-per-user distribution, as computed by the project helper.
freq_tweets_usuario <- get_tweet_distribution(education)

# Share of tweets that falls in each 5% interval of the cumulative share of
# users (rows sorted by descending `tweets`), plus its running total within
# each community.
freq_tweets_usuario %>%
  arrange(desc(tweets)) %>%
  mutate(
    tweets_acum = cumsum(tweets),
    usuarios_acum = cumsum(usuarios),
    por_tweets_acum = tweets_acum / sum(tweets),
    por_usuarios_acum = usuarios_acum / sum(usuarios),
    int_users_acum = cut(por_usuarios_acum, breaks = seq(0, 1, by = 0.05))
  ) %>%
  group_by(comunidad, int_users_acum) %>%
  summarise(porc_tweets = sum(porc_tweets)) %>%
  # summarise() leaves the result grouped by comunidad, so the running total
  # restarts for every community
  mutate(porc_tweets_acum = cumsum(porc_tweets))

## # A tibble: 20 x 4
## # Groups:   comunidad [4]
##    comunidad    int_users_acum porc_tweets porc_tweets_acum
##    <fct>        <fct>                <dbl>            <dbl>
##  1 GOP          (0,0.05]         0.983                0.983
##  2 GOP          (0.05,0.1]       0.00673              0.990
##  3 GOP          (0.1,0.15]       0.00505              0.995
##  4 GOP          (0.25,0.3]       0.00337              0.998
##  5 GOP          (0.95,1]         0.00168              1    
##  6 Independent  (0,0.05]         0.999                0.999
##  7 Independent  (0.05,0.1]       0.000246             1.00 
##  8 Independent  (0.1,0.15]       0.000184             1.00 
##  9 Independent  (0.2,0.25]       0.000123             1.00 
## 10 Independent  (0.95,1]         0.0000614            1    
## 11 DNC          (0,0.05]         0.987                0.987
## 12 DNC          (0.05,0.1]       0.00512              0.992
## 13 DNC          (0.1,0.15]       0.00384              0.996
## 14 DNC          (0.25,0.3]       0.00256              0.999
## 15 DNC          (0.95,1]         0.00128              1    
## 16 Progressives (0,0.05]         0.993                0.993
## 17 Progressives (0.05,0.1]       0.00272              0.996
## 18 Progressives (0.1,0.15]       0.00204              0.998
## 19 Progressives (0.3,0.35]       0.00136              0.999
## 20 Progressives (0.95,1]         0.000681             1.00

# Lorenz curve: cumulative share of tweets against cumulative share of users,
# one line per community.
ggplot(
  freq_tweets_usuario,
  aes(x = porc_usuarios_acum, y = porc_tweets_acum)
) +
  geom_line(aes(color = comunidad), size = 1.3) +
  scale_color_manual(values = plots_palette)

# Gini coefficient of `tweets` per community, using `porc_usuarios` as weights;
# communities sorted from the highest to the lowest coefficient.
gini_comunidades <- group_by(freq_tweets_usuario, comunidad) %>%
  summarise(
    gini = gini(x = tweets, weights = porc_usuarios)
  ) %>%
  arrange(desc(gini))

Análisis de tópicos

Hashtags más usados

El partido republicano tiene más hashtags relacionados con el home schooling, mientras que los demócratas se centran en la educación a distancia. En los progresistas destaca “Feedly”, una herramienta de agregación de noticias. También se centran en temas reivindicativos relacionados con minorías y desigualdades sociales (LGTB, Taxtherich).

# From this point on only GOP, DNC and Progressives are analysed.
education <- filter(education, comunidad != "Independent")

# Tidy data: unigrams -> hashtags -> hashtag frequencies.
# (The second argument of get_unigrams() is presumably the set of terms to
# leave out -- TODO confirm against the helper.)
education_unigram <- get_unigrams(
  education,
  c("#education", "education")
)
education_hashtag <- get_hashtags(education_unigram)
education_frequency <- get_frequencies(education_hashtag)



## Topics with networks ##

# Hashtag network for the GOP community
gop_education_net <- create_hashtag_graph(
  education_hashtag,
  "GOP",
  "#education",
  minim_n = 3,
  nodos_n = 20
)

## Iniciacion data frame de links
## Obtencion de data frame con conteo de hashtag  por status_id
## Obtencion de data frame de frequencias de hasthags
## Obtencion del porcentade de usuarios del hashtag
## Filtro de datos por nodos mas frecuentes
## Obtencion de tweets unicos
## Inicio de loop para obtener data frame de links
## [1] "Nodo: 1  -  #homeschooling"
## [1] "Nodo: 2  -  #schoolchoice"
## [1] "Nodo: 3  -  #trump2020"
## [1] "Nodo: 4  -  #maga"
## [1] "Nodo: 5  -  #remotelearning"
## [1] "Nodo: 6  -  #schools"
## [1] "Nodo: 7  -  #jobs"
## [1] "Nodo: 8  -  #students"
## [1] "Nodo: 9  -  #technology"
## [1] "Nodo: 10  -  #txed"
## [1] "Nodo: 11  -  #lalege"
## [1] "Nodo: 12  -  #teachers"
## [1] "Nodo: 13  -  #commoncore"
## [1] "Nodo: 14  -  #learning"
## [1] "Nodo: 15  -  #school"
## [1] "Nodo: 16  -  #txlege"
## [1] "Nodo: 17  -  #obamagate"
## [1] "Nodo: 18  -  #wwg1wga"
## [1] "Nodo: 19  -  #college"
## [1] "Nodo: 20  -  #teacherappreciationweek"
## [1] "Nodo: 21  -  #utpol"
## Fin de loop
## Adicion de nodos sin links
## Wrangling final del data frame
## Devolucion del data frame

# Hashtag network for the DNC community
dnc_education_net <- create_hashtag_graph(
  education_hashtag,
  "DNC",
  "#education",
  minim_n = 3,
  nodos_n = 20
)

## Iniciacion data frame de links
## Obtencion de data frame con conteo de hashtag  por status_id
## Obtencion de data frame de frequencias de hasthags
## Obtencion del porcentade de usuarios del hashtag
## Filtro de datos por nodos mas frecuentes
## Obtencion de tweets unicos
## Inicio de loop para obtener data frame de links
## [1] "Nodo: 1  -  #teacherappreciationweek"
## [1] "Nodo: 2  -  #healthcare"
## [1] "Nodo: 3  -  #teachers"
## [1] "Nodo: 4  -  #highered"
## [1] "Nodo: 5  -  #remotelearning"
## [1] "Nodo: 6  -  #edtech"
## [1] "Nodo: 7  -  #edchat"
## [1] "Nodo: 8  -  #democrats"
## [1] "Nodo: 9  -  #medicaid"
## [1] "Nodo: 10  -  #distancelearning"
## [1] "Nodo: 11  -  #gop"
## [1] "Nodo: 12  -  #learning"
## [1] "Nodo: 13  -  #fundourfuture"
## [1] "Nodo: 14  -  #stem"
## [1] "Nodo: 15  -  #onlinelearning"
## [1] "Nodo: 16  -  #homeschooling"
## [1] "Nodo: 17  -  #science"
## [1] "Nodo: 18  -  #school"
## [1] "Nodo: 19  -  #students"
## [1] "Nodo: 20  -  #trump"
## [1] "Nodo: 21  -  #wtp2020"
## Fin de loop
## Adicion de nodos sin links
## Wrangling final del data frame
## Devolucion del data frame

# Hashtag network for the Progressives community
prg_education_net <- create_hashtag_graph(
  education_hashtag,
  "Progressives",
  "#education",
  minim_n = 3,
  nodos_n = 20
)

## Iniciacion data frame de links
## Obtencion de data frame con conteo de hashtag  por status_id
## Obtencion de data frame de frequencias de hasthags
## Obtencion del porcentade de usuarios del hashtag
## Filtro de datos por nodos mas frecuentes
## Obtencion de tweets unicos
## Inicio de loop para obtener data frame de links
## [1] "Nodo: 1  -  #metoo"
## [1] "Nodo: 2  -  #voteblue"
## [1] "Nodo: 3  -  #medicareforall"
## [1] "Nodo: 4  -  #fundnyschools"
## [1] "Nodo: 5  -  #protectnyschools"
## [1] "Nodo: 6  -  #healthcare"
## [1] "Nodo: 7  -  #makebillionairespay"
## [1] "Nodo: 8  -  #redfored"
## [1] "Nodo: 9  -  #science"
## [1] "Nodo: 10  -  #highered"
## [1] "Nodo: 11  -  #teachers"
## [1] "Nodo: 12  -  #taxtherich"
## [1] "Nodo: 13  -  #teacherappreciationweek"
## [1] "Nodo: 14  -  #highereducation"
## [1] "Nodo: 15  -  #remotelearning"
## [1] "Nodo: 16  -  #wearetherevolution"
## [1] "Nodo: 17  -  #edchat"
## [1] "Nodo: 18  -  #mapoli"
## [1] "Nodo: 19  -  #wtp2020"
## [1] "Nodo: 20  -  #economy"
## [1] "Nodo: 21  -  #greennewdeal"
## [1] "Nodo: 22  -  #students"
## [1] "Nodo: 23  -  #wtp300"
## Fin de loop
## Adicion de nodos sin links
## Wrangling final del data frame
## Devolucion del data frame

# Combine the three community networks into a single graph.
education_graph <- bind_graphs(
  gop_education_net,
  dnc_education_net,
  prg_education_net
) %>%
  # nodes: fix the community order
  activate(nodes) %>%
  mutate(
    comunidad = fct_relevel(comunidad, c("GOP", "DNC", "Progressives"))
  ) %>%
  # edges: keep only links with strength above 1
  activate(edges) %>%
  filter(strength > 1) %>%
  # nodes again: degree is computed after the edge filter, so it only counts
  # the links that were kept
  activate(nodes) %>%
  mutate(degree = centrality_degree())

# Top 20 entries by `n` for DNC and for Progressives
top_dnc <- education_frequency %>%
  filter(comunidad == "DNC") %>%
  top_n(20, n) %>%
  pull(word)

top_prg <- education_frequency %>%
  filter(comunidad == "Progressives") %>%
  top_n(20, n) %>%
  pull(word)

# How many entries the two top lists share
length(intersect(top_dnc, top_prg))

## [1] 0

No hay ningún tema en común entre los top 20 temas comentados en el partido Demócrata y el movimiento progresista

# Network plot of the most relevant hashtags per community
plot_topic_networks(
  education_frequency,
  education_graph,
  filtered_words = "#education",
  texto_titulo = "Hashtags más relevantes",
  lab_size = 16,
  text_size = 5,
  layout_style = "nicely",
  minim_n = 3
)

Correlaciones entre hashtags

Hay un cluster en el partido demócrata relacionado con las elecciones y la campaña política. Hay otro cluster en el movimiento progresista relacionado principalmente con las minorías.

# Hashtag correlation network per community; the nodes of each graph are
# tagged with the community they come from.
gop_cor_net <- get_correlation_network(
  education_hashtag,
  education_frequency,
  com = "GOP",
  exclude_words = "#education",
  cor_limit = 0.15
) %>%
  activate(nodes) %>%
  mutate(comunidad = "GOP")

dnc_cor_net <- get_correlation_network(
  education_hashtag,
  education_frequency,
  com = "DNC",
  exclude_words = "#education",
  cor_limit = 0.25,
  top_words = 15
) %>%
  activate(nodes) %>%
  mutate(comunidad = "DNC")

prg_cor_net <- get_correlation_network(
  education_hashtag,
  education_frequency,
  com = "Progressives",
  exclude_words = "#education",
  cor_limit = 0.25,
  top_words = 15
) %>%
  activate(nodes) %>%
  mutate(comunidad = "Progressives")

# Single graph with the three networks and a fixed community order
education_net <- bind_graphs(gop_cor_net, dnc_cor_net, prg_cor_net) %>%
  activate(nodes) %>%
  mutate(
    comunidad = fct_relevel(comunidad, c("GOP", "DNC", "Progressives"))
  )

# Correlation networks, one facet per community.
# Edge transparency encodes the correlation, node fill the `sent` value and
# node size the count `n`; labels are coloured by community.
education_net %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(pch = 21, aes(fill = sent, size = n)) +
  geom_node_text(aes(label = name, color = comunidad), repel = TRUE, size = 5) +
  # the facet specification was previously added twice; once is enough
  facet_nodes(~comunidad, ncol = 1, scales = "free") +
  scale_fill_gradient2(low = "red", high = "green", mid = "grey") +
  # palette positions 1, 3, 4 = GOP, DNC, Progressives
  scale_color_manual(values = plots_palette[c(1, 3, 4)]) +
  theme_graph() +
  # "none" hides the colour legend; the logical form (F / FALSE) is deprecated
  guides(color = "none") +
  theme(strip.text = element_text(size = 16))

Temas propios de comunidades

Hashtags

En los republicanos, junto con temas de homeschooling, aparecen algunas teorías conspirativas (#obamagate, #wakeupamerica). En los demócratas y progresistas abundan en mayor proporción los temas que ya destacaban en los hashtags más usados.

# TF-IDF of hashtags, to surface the ones that characterise each community.
education_tfidf_hashtags <- education_frequency %>%
  filter(comunidad != "Independent") %>%
  get_tfidf()

plot_tfidf(education_tfidf_hashtags, cols = 3, colores = c(1, 3, 4)) +
  labs(title = "Tópicos característicos de cada comunidad", y = "TF-IDF") +
  # "none" hides the fill legend; the logical form (F / FALSE) is deprecated
  guides(fill = "none") +
  theme(
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 16),
    legend.title = element_blank(),
    strip.text = element_text(size = 15)
  )

El top de menciones para el partido republicano es @professorcrunk: una activista feminista (https://read.macmillan.com/lp/eloquent-rage/). En el DNC es Stephanie Ruhle, periodista de MSNBC (https://twitter.com/SRuhle). En el movimiento progresista es AQENY (Alliance for Quality Education): una coalición para la educación pública de alta calidad en NY (https://www.aqeny.org/, https://twitter.com/AQE_NY). Destacan también personalidades y canales políticos progresistas en la esfera de YouTube como Kyle Kulinski y The Progressive Voice.

# TF-IDF of mentions (tokens starting with "@"), to surface the accounts that
# characterise each community.
education_tfidf_mentions <- education_unigram %>%
  filter(str_detect(word, "^@"), comunidad != "Independent") %>%
  get_frequencies() %>%
  get_tfidf()

# NOTE(review): the title is the same one used for the hashtags plot although
# this chart shows mentions -- confirm whether it should be reworded.
plot_tfidf(education_tfidf_mentions, cols = 3, colores = c(1, 3, 4)) +
  labs(title = "Tópicos característicos de cada comunidad", y = "TF-IDF") +
  # "none" hides the fill legend; the logical form (F / FALSE) is deprecated
  guides(fill = "none") +
  theme(
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 16),
    legend.title = element_blank(),
    strip.text = element_text(size = 15)
  )

Sentimiento NRC

Sentimiento de confianza en todos los grupos. Sin diferencias

# NRC emotion profile per community: count the emotion-bearing words, rescale
# the counts within each community and spread them into one column per emotion.
nrc_education <- education_unigram %>%
  filter(comunidad != "Independent") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # keep the emotions only, dropping the positive/negative polarity labels
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(comunidad, sentiment, sort = TRUE) %>%
  mutate(sentiment = str_to_title(sentiment)) %>%
  ungroup() %>%
  group_by(comunidad) %>%
  # rescaling is done per community (presumably scales::rescale, i.e. to the
  # [0, 1] range -- TODO confirm no other attached package masks it)
  mutate(n = rescale(n)) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  select(
    comunidad, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust
  )

# Radar chart of the rescaled NRC emotion counts, one polygon per community.
# NOTE(review): the previous comment here said fear predominates, while the
# report text states trust is the dominant emotion -- confirm against the plot.
# The title used to read 'education Change', which looked like a leftover from
# another topic's script; it now names the topic analysed in this file.
ggradar(
  nrc_education %>%
    ungroup() %>%
    # drop the unused "Independent" level so it does not reach the plot
    mutate(comunidad = fct_drop(comunidad)),
  group.colours = plots_palette[c(1, 3, 4)],
  legend.position = "top",
  group.point.size = 4,
  plot.title = "Sentimientos expresados hacia el tópico 'Education'",
  legend.text.size = 11
) +
  theme(
    plot.title = element_text(size = 12, family = "Georgia", color = "grey55"),
    axis.title = element_text(family = "Georgia")
  )

Polaridad

Hay un nivel de polarización significativo entre el partido demócrata y el movimiento progresista, pero no muy elevado.

library(boot)

# Pre-computed polarization test results for the education topic
pol_test_dif <- readRDS(
  here::here(
    "datos_procesados",
    "formated_polarization_df_data",
    "polarization_testing",
    "education_polarization.rds"
  )
)

# Print everything except the first element
pol_test_dif[-1]

## $IC
##                2.5%       97.5%
## DNC-GOP -0.00308642 0.041666667
## DNC-PRG -0.02103996 0.004781421
## 
## $resultado
## [1] "No se encuentran diferencias en medias"

# Overlaid histograms of the bootstrap replications of the polarization score,
# one per community.
# NOTE(review): pol_test_dif$resultado reports no difference in means, so any
# reading of this plot as showing significant differences should be confirmed.
ggplot(pol_test_dif$data, aes(x = sentimiento)) +
  geom_histogram(
    aes(group = comunidad, fill = comunidad),
    color = "white",
    alpha = 0.5,
    binwidth = 0.003,
    position = "identity"
  ) +
  scale_fill_manual(values = plots_palette[c(1, 4)]) +
  labs(
    title = "Replicaciones bootstrap del scoring de polarización",
    x = "Average polarization score"
  )