youtubesub

library(tm)

## Warning: package 'tm' was built under R version 4.3.2

## Loading required package: NLP

# Folder that holds the 2018 subtitle transcripts (.txt files)
carpetayt2018 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/subtítulo youtube/yt_2018"

# Full paths of every .txt file found in that folder
archivosyt2018 <- list.files(carpetayt2018, pattern = "\\.txt$", full.names = TRUE)
if (length(archivosyt2018) == 0L) {
  # An empty folder would otherwise yield an empty text without any hint
  warning("No se encontraron archivos .txt en: ", carpetayt2018, call. = FALSE)
}

# Read and clean every file in one pass. lapply() builds the result at once
# instead of growing a list with c() on every iteration, and keeps the
# per-file temporaries out of the global environment.
lineas_limpiasyt2018 <- lapply(archivosyt2018, function(ruta) {
  lineas <- tolower(readLines(ruta, warn = FALSE))
  lineas <- gsub("http\\S+|www\\S+|ftp\\S+", "", lineas)  # drop URLs
  lineas <- gsub("<.*?>", "", lineas)                      # drop HTML tags
  gsub("[[:punct:]]", "", lineas)                          # drop punctuation
})

# Flat list with one cleaned line per element (same shape as before)
textosyt2018 <- as.list(unlist(lineas_limpiasyt2018, use.names = FALSE))

# Collapse all cleaned lines into a single space-separated string
texto_finalyt2018 <- paste(textosyt2018, collapse = " ")

# Load the required libraries (nothing is installed here)
library(tm)
library(proxy)

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

library(cluster)
library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.3.2

## Loading required package: RColorBrewer

# Work on the single pre-cleaned string built from all 2018 transcripts
todos_los_textosyt2018 <- tolower(texto_finalyt2018[[1]])

# Remove the standard Spanish stopwords shipped with tm
todos_los_textosyt2018 <- removeWords(todos_los_textosyt2018, stopwords("spanish"))

# Extra filler words specific to this corpus
palabras_eliminaryt2018 <- c(
  "pues", "así", "decir", "además", "vez", "día", "dos", "cada", "hace",
  "aquí", "precisamente", "cuáles", "ahí", "parte", "entonces", "casi",
  "través", "muchísimas", "todas", "ahora", "puede", "solamente"
)

# removeWords() deletes whole words only. A plain gsub() on "a|b|c" also cuts
# the pattern out of longer words (e.g. "dos" inside "estados", "parte" inside
# "departamento"), which corrupts the vocabulary and the counts below.
todos_los_textosyt2018 <- removeWords(todos_los_textosyt2018, palabras_eliminaryt2018)

# Corpus built from that single string, and its document-term matrix
corpusyt2018 <- Corpus(VectorSource(todos_los_textosyt2018))
dtmyt2018 <- DocumentTermMatrix(corpusyt2018)

# Total count per term, most frequent first
conteo_totalyt2018 <- colSums(as.matrix(dtmyt2018))
terminos_comunesyt2018 <- sort(conteo_totalyt2018, decreasing = TRUE)

# Frequency table (term, count); show the 25 most frequent terms
Frecuenciayt2018 <- data.frame(termino = names(terminos_comunesyt2018), conteo = terminos_comunesyt2018)
head(Frecuenciayt2018, 25)

##                     termino conteo
## indígenas         indígenas    164
## educación         educación    111
## indígena           indígena    101
## pueblos             pueblos     65
## lengua               lengua     64
## universidad     universidad     52
## población         población     48
## lenguas             lenguas     46
## niños                 niños     43
## bueno                 bueno     39
## importante       importante     37
## nacional           nacional     37
## escuelas           escuelas     34
## comunidades     comunidades     33
## jóvenes             jóvenes     32
## intercultural intercultural     31
## país                   país     31
## desarrollo       desarrollo     30
## gracias             gracias     28
## cultura             cultura     27
## méxico               méxico     27
## nivel                 nivel     27
## primaria           primaria     25
## programas         programas     25
## años                   años     24

library(wordcloud)
# Word cloud of the 2018 term frequencies: at most 100 terms, each occurring
# at least 5 times, drawn in decreasing order of frequency, with 20% of the
# words rotated.
wordcloud(
  words = Frecuenciayt2018[["termino"]],
  freq = Frecuenciayt2018[["conteo"]],
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.2,
  colors = brewer.pal(10, "Paired")
)

library(tm)
library(topicmodels)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(tidytext)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Number of topics to fit
num_topics <- 5

# Fit the LDA model. The seed is fixed so the topics (and every table and plot
# derived from them) are identical on each run; without it each run starts
# from a different random initialisation.
# NOTE(review): dtmyt2018 is built from one merged string, so the model sees a
# single document. One document per transcript would give LDA co-occurrence
# structure to work with - confirm the intended design.
lda_modelyt2018 <- LDA(dtmyt2018, k = num_topics, control = list(seed = 1234))

# Five highest-beta terms of each topic
termsyt2018 <- tidy(lda_modelyt2018, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Print the top terms of each topic (the header used to be printed with no
# table after it)
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(termsyt2018)

# Bar chart of the top terms and their beta weight, one panel per topic
ggplot(termsyt2018, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Tópicos 2018",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Load the libraries used for the emotion analysis (nothing is installed here)
library(text)

## Warning: package 'text' was built under R version 4.3.2

## [0;34mThis is text (version 1.0).
## [0m[0;32mText is new and still rapidly improving.
##                
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
##                Please send us feedback based on your experience.[0m[0;35m
## 
## Please note that defaults has changed in the textEmbed-functions since last version; see help(textEmbed) or www.r-text.org for more details.[0m

library(dplyr)

# Score the text against the NRC emotion lexicon (Spanish word list).
# get_nrc_sentiment() is provided by the syuzhet package, not by 'text', so it
# is called with an explicit namespace.
puntuaciones_emocionesyt2018 <- syuzhet::get_nrc_sentiment(todos_los_textosyt2018, language = "spanish")

# Spanish label for each NRC column, keyed by the original English name.
# Renaming by name rather than by position keeps every score attached to the
# right label: the previous positional list did not follow the order of the
# returned columns (anger, anticipation, disgust, fear, joy, sadness,
# surprise, trust, negative, positive), so five of the ten scores were shown
# under the wrong name.
sentimientos_espanolyt2018 <- c(
  anger = "ira", anticipation = "anticipacion", disgust = "disgusto",
  fear = "miedo", joy = "alegria", sadness = "tristeza",
  surprise = "asombro", trust = "confianza",
  negative = "negativo", positive = "positivo"
)

# Fail loudly if the lexicon ever returns a column with no translation
stopifnot(all(colnames(puntuaciones_emocionesyt2018) %in% names(sentimientos_espanolyt2018)))
colnames(puntuaciones_emocionesyt2018) <- unname(
  sentimientos_espanolyt2018[colnames(puntuaciones_emocionesyt2018)]
)

# One row per emotion with its total score
sentimientos_dfyt2018 <- data.frame(
  sentimiento = colnames(puntuaciones_emocionesyt2018),
  valor = colSums(puntuaciones_emocionesyt2018)
)

# Sort from the highest to the lowest total
sentimientos_dfyt2018 <- sentimientos_dfyt2018 %>% arrange(desc(valor))

# Bar chart of the emotion totals, bars ordered from highest to lowest
library(ggplot2)
ggplot(sentimientos_dfyt2018, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2018", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotate x-axis labels

#AÑO 2019

library(tm)

# Folder that holds the 2019 subtitle transcripts (.txt files)
carpetayt2019 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/subtítulo youtube/yt_2019"

# Full paths of every .txt file found in that folder
archivosyt2019 <- list.files(carpetayt2019, pattern = "\\.txt$", full.names = TRUE)
if (length(archivosyt2019) == 0L) {
  # An empty folder would otherwise yield an empty text without any hint
  warning("No se encontraron archivos .txt en: ", carpetayt2019, call. = FALSE)
}

# Read and clean every file in one pass. lapply() builds the result at once
# instead of growing a list with c() on every iteration, and keeps the
# per-file temporaries out of the global environment.
lineas_limpiasyt2019 <- lapply(archivosyt2019, function(ruta) {
  lineas <- tolower(readLines(ruta, warn = FALSE))
  lineas <- gsub("http\\S+|www\\S+|ftp\\S+", "", lineas)  # drop URLs
  lineas <- gsub("<.*?>", "", lineas)                      # drop HTML tags
  gsub("[[:punct:]]", "", lineas)                          # drop punctuation
})

# Flat list with one cleaned line per element (same shape as before)
textosyt2019 <- as.list(unlist(lineas_limpiasyt2019, use.names = FALSE))

# Collapse all cleaned lines into a single space-separated string
texto_finalyt2019 <- paste(textosyt2019, collapse = " ")

# Load the required libraries (nothing is installed here)
library(tm)
library(proxy)
library(cluster)
library(wordcloud)

# Work on the single pre-cleaned string built from all 2019 transcripts
todos_los_textosyt2019 <- tolower(texto_finalyt2019[[1]])

# Remove the standard Spanish stopwords shipped with tm
todos_los_textosyt2019 <- removeWords(todos_los_textosyt2019, stopwords("spanish"))

# Extra filler words specific to this corpus
palabras_eliminaryt2019 <- c(
  "aquí", "cada", "este", "éste", "ahorita", "donde", "dónde", "veces",
  "creo", "alguna", "manera", "hace", "tan", "cuál", "tal", "ahora",
  "hacemos", "usted", "digo", "haber", "casi", "ahí", "así", "pues",
  "parte", "entonces", "cómo", "voy"
)

# removeWords() deletes whole words only. A plain gsub() on "a|b|c" also cuts
# the pattern out of longer words (e.g. "tan" inside "importante", "tal"
# inside "hospital"), which corrupts the vocabulary and the counts below.
todos_los_textosyt2019 <- removeWords(todos_los_textosyt2019, palabras_eliminaryt2019)

# Corpus built from that single string, and its document-term matrix
corpusyt2019 <- Corpus(VectorSource(todos_los_textosyt2019))
dtmyt2019 <- DocumentTermMatrix(corpusyt2019)

# Total count per term, most frequent first
conteo_totalyt2019 <- colSums(as.matrix(dtmyt2019))
terminos_comunesyt2019 <- sort(conteo_totalyt2019, decreasing = TRUE)

# Frequency table (term, count); show the 25 most frequent terms
Frecuenciayt2019 <- data.frame(termino = names(terminos_comunesyt2019), conteo = terminos_comunesyt2019)
head(Frecuenciayt2019, 25)

##             termino conteo
## música       música    185
## indígenas indígenas     57
## indígena   indígena     34
## muchas       muchas     27
## méxico       méxico     27
## gracias     gracias     26
## bueno         bueno     25
## oaxaca       oaxaca     20
## aplausos   aplausos     19
## pueblos     pueblos     19
## vamos         vamos     19
## gente         gente     18
## bien           bien     17
## decir         decir     17
## hecho         hecho     17
## trabajo     trabajo     17
## años           años     16
## personas   personas     15
## ver             ver     15
## verdad       verdad     15
## amigos       amigos     14
## ciudad       ciudad     14
## cultura     cultura     14
## mexico       mexico     14
## pan             pan     14

library(wordcloud)
# Word cloud of the 2019 term frequencies: at most 100 terms, each occurring
# at least 3 times, drawn in decreasing order of frequency, with 20% of the
# words rotated.
wordcloud(
  words = Frecuenciayt2019[["termino"]],
  freq = Frecuenciayt2019[["conteo"]],
  min.freq = 3,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.2,
  colors = brewer.pal(10, "Paired")
)

library(tm)
library(topicmodels)
library(ggplot2)
library(tidytext)
library(dplyr)

# Number of topics to fit
num_topics <- 5

# Fit the LDA model. The seed is fixed so the topics (and every table and plot
# derived from them) are identical on each run; without it each run starts
# from a different random initialisation.
# NOTE(review): dtmyt2019 is built from one merged string, so the model sees a
# single document. One document per transcript would give LDA co-occurrence
# structure to work with - confirm the intended design.
lda_modelyt2019 <- LDA(dtmyt2019, k = num_topics, control = list(seed = 1234))

# Five highest-beta terms of each topic
termsyt2019 <- tidy(lda_modelyt2019, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Print the top terms of each topic
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(termsyt2019)

## # A tibble: 25 × 3
## # Groups:   topic [5]
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     3 bueno     0.00588
##  2     1 cultura   0.00759
##  3     4 gente     0.00800
##  4     2 gracias   0.00749
##  5     3 gracias   0.0116 
##  6     4 gracias   0.00806
##  7     1 indígena  0.0116 
##  8     2 indígena  0.00922
##  9     1 indígenas 0.0177 
## 10     2 indígenas 0.00708
## # ℹ 15 more rows

# Bar chart of the top terms and their beta weight, one panel per topic
ggplot(data = termsyt2019) +
  geom_col(
    aes(x = term, y = beta, fill = factor(topic)),
    show.legend = FALSE
  ) +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  coord_flip() +
  ggtitle("Distribución de Tópicos 2019") +
  xlab("Término") +
  ylab("Peso Beta") +
  theme_minimal()

# Extra: full table of per-topic term weights. The topic column is kept (as a
# trailing column) so each beta can be traced back to its topic; without it
# every term is listed once per topic with nothing to tell the rows apart.
terms_tableyt2019 <- tidy(lda_modelyt2019, matrix = "beta") %>%
  select(term, beta, topic)

# Print the table of terms
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_tableyt2019)

## # A tibble: 10,700 × 2
##    term        beta
##    <chr>      <dbl>
##  1 100   0.000300  
##  2 100   0.000181  
##  3 100   0.000260  
##  4 100   0.000182  
##  5 100   0.000155  
##  6 166   0.00000970
##  7 166   0.000102  
##  8 166   0.000105  
##  9 166   0.000426  
## 10 166   0.000498  
## # ℹ 10,690 more rows

library(text)
library(dplyr)

# Score the text against the NRC emotion lexicon (Spanish word list).
# get_nrc_sentiment() is provided by the syuzhet package, not by 'text', so it
# is called with an explicit namespace.
puntuaciones_emocionesyt2019 <- syuzhet::get_nrc_sentiment(todos_los_textosyt2019, language = "spanish")

# Spanish label for each NRC column, keyed by the original English name.
# Renaming by name rather than by position keeps every score attached to the
# right label: the previous positional list did not follow the order of the
# returned columns (anger, anticipation, disgust, fear, joy, sadness,
# surprise, trust, negative, positive), so five of the ten scores were shown
# under the wrong name.
sentimientos_espanolyt2019 <- c(
  anger = "ira", anticipation = "anticipacion", disgust = "disgusto",
  fear = "miedo", joy = "alegria", sadness = "tristeza",
  surprise = "asombro", trust = "confianza",
  negative = "negativo", positive = "positivo"
)

# Fail loudly if the lexicon ever returns a column with no translation
stopifnot(all(colnames(puntuaciones_emocionesyt2019) %in% names(sentimientos_espanolyt2019)))
colnames(puntuaciones_emocionesyt2019) <- unname(
  sentimientos_espanolyt2019[colnames(puntuaciones_emocionesyt2019)]
)

# One row per emotion with its total score
sentimientos_dfyt2019 <- data.frame(
  sentimiento = colnames(puntuaciones_emocionesyt2019),
  valor = colSums(puntuaciones_emocionesyt2019)
)

# Sort from the highest to the lowest total
sentimientos_dfyt2019 <- sentimientos_dfyt2019 %>% arrange(desc(valor))

# Bar chart of the emotion totals, bars ordered from highest to lowest
library(ggplot2)
ggplot(sentimientos_dfyt2019, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2019", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotate x-axis labels

#AÑO 2020

library(tm)

# Folder that holds the 2020 subtitle transcripts (.txt files)
carpetayt2020 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/subtítulo youtube/yt_2020"

# Full paths of every .txt file found in that folder
archivosyt2020 <- list.files(carpetayt2020, pattern = "\\.txt$", full.names = TRUE)
if (length(archivosyt2020) == 0L) {
  # An empty folder would otherwise yield an empty text without any hint
  warning("No se encontraron archivos .txt en: ", carpetayt2020, call. = FALSE)
}

# Read and clean every file in one pass. lapply() builds the result at once
# instead of growing a list with c() on every iteration, and keeps the
# per-file temporaries out of the global environment.
lineas_limpiasyt2020 <- lapply(archivosyt2020, function(ruta) {
  lineas <- tolower(readLines(ruta, warn = FALSE))
  lineas <- gsub("http\\S+|www\\S+|ftp\\S+", "", lineas)  # drop URLs
  lineas <- gsub("<.*?>", "", lineas)                      # drop HTML tags
  gsub("[[:punct:]]", "", lineas)                          # drop punctuation
})

# Flat list with one cleaned line per element (same shape as before)
textosyt2020 <- as.list(unlist(lineas_limpiasyt2020, use.names = FALSE))

# Collapse all cleaned lines into a single space-separated string
texto_finalyt2020 <- paste(textosyt2020, collapse = " ")

# Work on the single pre-cleaned string built from all 2020 transcripts
todos_los_textosyt2020 <- tolower(texto_finalyt2020[[1]])

# Remove the standard Spanish stopwords shipped with tm
todos_los_textosyt2020 <- removeWords(todos_los_textosyt2020, stopwords("spanish"))

# Extra filler words specific to this corpus
palabras_eliminaryt2020 <- c(
  "podría", "días", "dias", "dos", "debera", "aquí", "decir", "vez",
  "puedes", "hoy", "cada", "veces", "así", "tan", "creo", "casi", "tipo",
  "sólo", "quizá", "cuales", "pues", "luego", "digamos", "dígamos",
  "entonces", "alguna", "pregunta", "realmente", "todas", "tema", "índice",
  "manera"
)

# removeWords() deletes whole words only. A plain gsub() on "a|b|c" also cuts
# the pattern out of longer words (e.g. "tema" inside "sistema", "dos" inside
# "estados"), which corrupts the vocabulary and the counts below.
todos_los_textosyt2020 <- removeWords(todos_los_textosyt2020, palabras_eliminaryt2020)

# Corpus built from that single string, and its document-term matrix
corpusyt2020 <- Corpus(VectorSource(todos_los_textosyt2020))
dtmyt2020 <- DocumentTermMatrix(corpusyt2020)

# Total count per term, most frequent first
conteo_totalyt2020 <- colSums(as.matrix(dtmyt2020))
terminos_comunesyt2020 <- sort(conteo_totalyt2020, decreasing = TRUE)

# Frequency table (term, count); show the 25 most frequent terms
Frecuenciayt2020 <- data.frame(termino = names(terminos_comunesyt2020), conteo = terminos_comunesyt2020)
head(Frecuenciayt2020, 25)

##                 termino conteo
## indígenas     indígenas     72
## pueblos         pueblos     34
## música           música     29
## municipios   municipios     26
## méxico           méxico     26
## país               país     26
## indígena       indígena     20
## lenguas         lenguas     19
## letalidad     letalidad     17
## vídeo             vídeo     17
## momento         momento     16
## comunidades comunidades     15
## cultura         cultura     15
## población     población     15
## gran               gran     12
## grupos           grupos     12
## localidades localidades     12
## nacional       nacional     12
## mundo             mundo     11
## número           número     11
## parte             parte     11
## realidad       realidad     11
## vida               vida     11
## ciento           ciento     10
## derechos       derechos     10

# Word cloud of the 2020 term frequencies: at most 100 terms, drawn in
# decreasing order of frequency, with 20% of the words rotated.
# family = "Arial" is not a registered family on the Windows graphics device
# and raised a "font family not found in Windows font database" warning for
# every word drawn. "sans" is the device-independent name and resolves to the
# default sans-serif face (Arial on Windows).
wordcloud(words = Frecuenciayt2020$termino,
          freq = Frecuenciayt2020$conteo,
          min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(10, "Paired"),
          family = "sans")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

library(tm)
library(topicmodels)
library(ggplot2)
library(tidytext)
library(dplyr)

# Number of topics to fit
num_topics <- 5

# Fit the LDA model. The seed is fixed so the topics (and every table and plot
# derived from them) are identical on each run; without it each run starts
# from a different random initialisation.
# NOTE(review): dtmyt2020 is built from one merged string, so the model sees a
# single document. One document per transcript would give LDA co-occurrence
# structure to work with - confirm the intended design.
lda_modelyt2020 <- LDA(dtmyt2020, k = num_topics, control = list(seed = 1234))

# Five highest-beta terms of each topic
termsyt2020 <- tidy(lda_modelyt2020, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Print the top terms of each topic
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(termsyt2020)

## # A tibble: 25 × 3
## # Groups:   topic [5]
##    topic term           beta
##    <int> <chr>         <dbl>
##  1     2 comunidades 0.0115 
##  2     5 comunidades 0.00848
##  3     2 indígena    0.0134 
##  4     4 indígena    0.0133 
##  5     1 indígenas   0.0339 
##  6     3 indígenas   0.0394 
##  7     4 indígenas   0.00909
##  8     5 indígenas   0.0215 
##  9     4 lenguas     0.0117 
## 10     5 localidades 0.00836
## # ℹ 15 more rows

# Bar chart of the top terms and their beta weight, one panel per topic
ggplot(data = termsyt2020) +
  geom_col(
    aes(x = term, y = beta, fill = factor(topic)),
    show.legend = FALSE
  ) +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  coord_flip() +
  ggtitle("Distribución de Tópicos 2020") +
  xlab("Término") +
  ylab("Peso Beta") +
  theme_minimal()

# Extra: full table of per-topic term weights. The topic column is kept (as a
# trailing column) so each beta can be traced back to its topic; without it
# every term is listed once per topic with nothing to tell the rows apart.
terms_tableyt2020 <- tidy(lda_modelyt2020, matrix = "beta") %>%
  select(term, beta, topic)

# Print the table of terms
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_tableyt2020)

## # A tibble: 7,890 × 2
##    term      beta
##    <chr>    <dbl>
##  1 100   0.000139
##  2 100   0.000631
##  3 100   0.000266
##  4 100   0.000517
##  5 100   0.000248
##  6 1948  0.000327
##  7 1948  0.000307
##  8 1948  0.000340
##  9 1948  0.000382
## 10 1948  0.000318
## # ℹ 7,880 more rows

# Load the libraries used for the emotion analysis (nothing is installed here)

library(text)
library(dplyr)

# Score the text against the NRC emotion lexicon (Spanish word list).
# get_nrc_sentiment() is provided by the syuzhet package, not by 'text', so it
# is called with an explicit namespace.
puntuaciones_emocionesyt2020 <- syuzhet::get_nrc_sentiment(todos_los_textosyt2020, language = "spanish")

# Spanish label for each NRC column, keyed by the original English name.
# Renaming by name rather than by position keeps every score attached to the
# right label: the previous positional list did not follow the order of the
# returned columns (anger, anticipation, disgust, fear, joy, sadness,
# surprise, trust, negative, positive), so five of the ten scores were shown
# under the wrong name.
sentimientos_espanolyt2020 <- c(
  anger = "ira", anticipation = "anticipacion", disgust = "disgusto",
  fear = "miedo", joy = "alegria", sadness = "tristeza",
  surprise = "asombro", trust = "confianza",
  negative = "negativo", positive = "positivo"
)

# Fail loudly if the lexicon ever returns a column with no translation
stopifnot(all(colnames(puntuaciones_emocionesyt2020) %in% names(sentimientos_espanolyt2020)))
colnames(puntuaciones_emocionesyt2020) <- unname(
  sentimientos_espanolyt2020[colnames(puntuaciones_emocionesyt2020)]
)

# One row per emotion with its total score
sentimientos_dfyt2020 <- data.frame(
  sentimiento = colnames(puntuaciones_emocionesyt2020),
  valor = colSums(puntuaciones_emocionesyt2020)
)

# Sort from the highest to the lowest total
sentimientos_dfyt2020 <- sentimientos_dfyt2020 %>% arrange(desc(valor))

# Bar chart of the emotion totals, bars ordered from highest to lowest
library(ggplot2)
ggplot(sentimientos_dfyt2020, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2020", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotate x-axis labels

youtubesub

Yamileth

2023-11-15