Evolución del discurso body positive y gordofobico desde el 2015 al 2023

Análisis del 2015

Palabras frecuentes por año (Wordclouds)

Se instalan las librerias que se utilizaran durante todo el proceso.

library(rvest)
library(magrittr)
library(httr)
library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:httr':
## 
##     content

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

Carga de archivos

Se analizan 8 noticias, 1 vídeo de youtube

# Ruta de la carpeta donde están los archivos seleccionados
carpeta_destino.15 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2015"

# Listar todos los archivos en la carpeta de destino
archivos_en_carpeta.15 <- list.files(path = carpeta_destino.15)

Limpieza de archivos

# Crear una función para limpiar el texto
limpiar_texto.15 <- function(archivo.15) {
  # Leer el contenido del archivo
  contenido.15 <- readLines(file.path(carpeta_destino.15, archivo.15), warn = FALSE)
  contenido.15 <- paste(contenido.15, collapse = " ")
  
  # Eliminar URLs
  contenido.15 <- gsub("http\\S+|www\\.\\S+", "", contenido.15)
  
  # Eliminar etiquetas HTML
  contenido.15 <- gsub("<.*?>", "", contenido.15)
  
  # Eliminar puntuación
  contenido.15 <- gsub("[[:punct:]]", "", contenido.15)
  
  # Eliminar números
  contenido.15 <- gsub("\\d+", "", contenido.15)
  
  # Eliminar espacios extra
  contenido.15 <- gsub("\\s+", " ", contenido.15)
  
  # Convertir a minúsculas
  contenido.15 <- tolower(contenido.15)
  
  return(contenido.15)
}

# Aplicar la función de limpieza a cada archivo en la carpeta
textos_limpios.15 <- lapply(archivos_en_carpeta.15, limpiar_texto.15)

Unimos en un mismo vector la función anterior

# Crear un vector llamado "Texto Limpio"
textos_limpios.15 <- lapply(archivos_en_carpeta.15, limpiar_texto.15)

# Combinar los textos limpios en un solo vector
TextoLimpio.15 <- unlist(textos_limpios.15)

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

library(tm)

# Crear un Corpus con los textos 
corpus.15 <- Corpus(VectorSource(TextoLimpio.15))

#Documetn Term Matrix
dtm.15 <- DocumentTermMatrix(corpus.15)

Se eliminan stopwords y palabras específicas

# Lista de palabras específicas no deseadas
palabras_no_deseadas.15 <- c("años","además","pues","aquí","vamos","hoy","entonces","cada","hacia","vez","hacia","parte","así","ahí","dos","gamma","ácidoslactobacilos")

# Eliminar stopwords de nuestro corpus
corpus_limpio.15 <- tm_map(corpus.15, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus.15, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas
corpus_limpio.15 <- tm_map(corpus_limpio.15, removeWords, palabras_no_deseadas.15)

## Warning in tm_map.SimpleCorpus(corpus_limpio.15, removeWords,
## palabras_no_deseadas.15): transformation drops documents

# Crear el nuevo DocumentTermMatrix
dtm_limpio.15<- DocumentTermMatrix(corpus_limpio.15)

# Sumar las columnas para obtener el conteo total de cada término
conteo_total_limpio.15 <- colSums(as.matrix(dtm_limpio.15))

# Ordenar y mostrar los términos más comunes
terminos_comunes_limpio.15 <- sort(conteo_total_limpio.15, decreasing = TRUE)

Se crea la visualización de datos

library(ggplot2)

# Definir una lista de colores personalizada
colores <- c("#8A2BE2", "#FF5F8D", "#FFAE66", "#0038B3", "#66CCFF", "#B38EB3", "#EEDD44", "#805380", "#FF6944", "#008E00", "#4E9B71", "#45A6E6", "#A89400", "#D27B45", "#A88945")


# Obtener los 15 términos más comunes
top_15_terminos.15 <- head(terminos_comunes_limpio.15, 15)

# Crear un data frame con los términos y sus conteos
data.15 <- data.frame(Término = names(top_15_terminos.15), Conteo = top_15_terminos.15)


# Crear la gráfica de barras sin líneas de fondo
ggplot(data.15, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2015", x = NULL, y = "Frecuencia") +
  theme_minimal() + # Establecer un tema minimalista
  theme(panel.grid.major = element_blank(), # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(), # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7)) # Ajustar la orientación del texto del eje x, centrar el título en negritas y ajustar la altura del título

# Tomar los 15 términos más comunes
terminos_top.15<- names(terminos_comunes_limpio.15)[1:15]

# Crear un vector con las frecuencias de estos términos
frecuencias_top.15 <- terminos_comunes_limpio.15[1:15]


# Invertir el orden de los vectores
terminos_top.15 <- rev(terminos_top.15)
frecuencias_top.15 <- rev(frecuencias_top.15)

# Crear un gráfico de barras con colores y ajustes de tamaño de letra
barplot(frecuencias_top.15, 
        names.arg = terminos_top.15, 
        horiz = TRUE, 
        las = 1, 
        main = "Los 15 términos más comunes en el discurso del 2015", 
        xlab = "Frecuencia", 
        col = colores,  # Agregar los colores personalizados
        border = NA,     # Color de los bordes de las barras
        cex.names = .7,      # Tamaño de letra en el eje x
        cex.axis = .8,       # Tamaño de letra en los ejes
        cex.main = 1.2,       # Tamaño de letra en el título principal
        cex.lab = 0.9         # Tamaño de letra en las etiquetas de los ejes
)

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio.15[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 18 veces."
## [1] "La palabra discriminación aparece 6 veces."
## [1] "La palabra belleza aparece 9 veces."
## [1] "La palabra mujeres aparece 23 veces."

library(wordcloud)

## Loading required package: RColorBrewer

# Crear la nube de palabras
# se van a plotear los términos comunes (base) que aparecen mínimo de 10 veces y máximo 200. 
wordcloud(names(terminos_comunes_limpio.15), terminos_comunes_limpio.15, min.freq = 10, max.words = 150, random.order = FALSE, rot.per = 0.3, colors = brewer.pal(8, "Dark2"))

Análisis de sentimiento

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(syuzhet)
corpus_vector_2.15 <- unlist(sapply(corpus_limpio.15 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.15 <- data.frame(text = corpus_vector_2.15) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 29 of `x` matches multiple rows in `y`.
## ℹ Row 4394 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.15)

##       sentiment   n
## 1         anger   9
## 2  anticipation   5
## 3       disgust  28
## 4          fear  12
## 5           joy   3
## 6      negative  33
## 7      positive 108
## 8       sadness  11
## 9      surprise   4
## 10        trust  24

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.15, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2015",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate()     masks NLP::annotate()
## ✖ NLP::content()          masks httr::content()
## ✖ tidyr::extract()        masks magrittr::extract()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::set_names()      masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)
library(dplyr)
library(sentimentr)

## 
## Attaching package: 'sentimentr'
## 
## The following object is masked from 'package:syuzhet':
## 
##     get_sentences

library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2015/2015_noticia_72.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm<- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda<- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico y seleccionar solo los primeros 10 términos
terms <- tidy(lda, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(6, wt = beta)

Ánalisis de toda la carpeta

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 3)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus<- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda<- LDA(dtm, k = 3)

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.15<-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2015"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.15, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("persona","grupos","social","pues")

# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(15)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1262 of `x` matches multiple rows in `y`.
## ℹ Row 12 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2015",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis del 2016

Carga de archivos

Se analizan 10 noticias, 1 vídeo de youtube y 2 fuentes de gobierno

# Ruta de la carpeta donde están los archivos seleccionados
carpeta_destino.16 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2016"

# Listar todos los archivos en la carpeta de destino
archivos_en_carpeta.16 <- list.files(path = carpeta_destino.16)

Limpieza de archivos

# Crear una función para limpiar el texto
limpiar_texto.16 <- function(archivo.16) {
  # Leer el contenido del archivo
  contenido.16 <- readLines(file.path(carpeta_destino.16, archivo.16), warn = FALSE)
  contenido.16 <- paste(contenido.16, collapse = " ")
  
  # Eliminar URLs
  contenido.16 <- gsub("http\\S+|www\\.\\S+", "", contenido.16)
  
  # Eliminar etiquetas HTML
  contenido.16 <- gsub("<.*?>", "", contenido.16)
  
  # Eliminar puntuación
  contenido.16 <- gsub("[[:punct:]]", "", contenido.16)
  
  # Eliminar números
  contenido.16 <- gsub("\\d+", "", contenido.16)
  
  # Eliminar espacios extra
  contenido.16 <- gsub("\\s+", " ", contenido.16)
  
  # Convertir a minúsculas
  contenido.16 <- tolower(contenido.16)
  
  return(contenido.16)
}

# Aplicar la función de limpieza a cada archivo en la carpeta
textos_limpios.16 <- lapply(archivos_en_carpeta.16, limpiar_texto.16)

Unimos en un mismo vector la función anterior

# Crear un vector llamado "Texto Limpio"
textos_limpios.16 <- lapply(archivos_en_carpeta.16, limpiar_texto.16)

# Combinar los textos limpios en un solo vector
TextoLimpio.16 <- unlist(textos_limpios.16)

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

library(tm)

# Crear un Corpus con los textos 
corpus.16 <- Corpus(VectorSource(TextoLimpio.16))

#Documetn Term Matrix
dtm.16 <- DocumentTermMatrix(corpus.16)

Se eliminan stopwords y palabras específicas que no funcionan al análisis

# Lista de palabras específicas no deseadas
palabras_no_deseadas.16 <- c("hoy","así","cfr","cit","tal","tan","intersex","dos","glosario","aimeé","párr")

# Eliminar stopwords de nuestro corpus
corpus_limpio.16 <- tm_map(corpus.16, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus.16, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas
corpus_limpio.16 <- tm_map(corpus_limpio.16, removeWords, palabras_no_deseadas.16)

## Warning in tm_map.SimpleCorpus(corpus_limpio.16, removeWords,
## palabras_no_deseadas.16): transformation drops documents

# Crear el nuevo DocumentTermMatrix
dtm_limpio.16<- DocumentTermMatrix(corpus_limpio.16)

# Sumar las columnas para obtener el conteo total de cada término
conteo_total_limpio.16 <- colSums(as.matrix(dtm_limpio.16))

# Ordenar y mostrar los términos más comunes
terminos_comunes_limpio.16 <- sort(conteo_total_limpio.16, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes
top_15_terminos.16 <- head(terminos_comunes_limpio.16, 15)

# Crear un data frame con los términos y sus conteos
data.16 <- data.frame(Término = names(top_15_terminos.16), Conteo = top_15_terminos.16)

Se crea la visualización de datos

library(wordcloud)
# Crear la nube de palabras
# se van a plotear los términos comunes (base) que aparecen mínimo de 7 veces y máximo 150. 
wordcloud(names(terminos_comunes_limpio.16), terminos_comunes_limpio.16, min.freq = 8, max.words = 200, random.order = FALSE, rot.per = 0.3, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : homofobia could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : sexualidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : información could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : organización could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : problemas could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : modelos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : relación could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : estudio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : asignado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : expresiones could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : frente could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : hablamos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : momento could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : pacientes could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : pensar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : primero could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : trabajar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : mañana could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : mientras could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : aquí could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : diciembre could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : guapa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : estándares could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : propia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : capacidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : intersexuales could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : justicia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : pleno could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : queer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : término could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.16),
## terminos_comunes_limpio.16, : yogyakarta could not be fit on page. It will not
## be plotted.

Análisis de sentimiento

corpus_vector_2.16 <- unlist(sapply(corpus_limpio.16 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().

youtube.sentimientos.16 <- data.frame(text = corpus_vector_2.16) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 407 of `x` matches multiple rows in `y`.
## ℹ Row 4290 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Crear la gráfica de barras sin líneas de fondo
ggplot(data.16, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2016", x = NULL, y = "Frecuencia") +
  theme_minimal() + # Establecer un tema minimalista
  theme(panel.grid.major = element_blank(), # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(), # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7)) # Ajustar la orientación del texto del eje x, centrar el título en negritas y ajustar la altura del título

# Tomar los 15 términos más comunes
terminos_top.16<- names(terminos_comunes_limpio.16)[1:15]

# Crear un vector con las frecuencias de estos términos
frecuencias_top.16 <- terminos_comunes_limpio.16[1:15]


# Invertir el orden de los vectores
terminos_top.16 <- rev(terminos_top.16)
frecuencias_top.16 <- rev(frecuencias_top.16)

# Crear un gráfico de barras con colores y ajustes de tamaño de letra
barplot(frecuencias_top.16, 
        names.arg = terminos_top.16, 
        horiz = TRUE, 
        las = 1, 
        main = "Los 15 términos más comunes en el discurso del 2016", 
        xlab = "Frecuencia", 
        col = colores,  # Agregar los colores personalizados
        border = NA,     # Color de los bordes de las barras
        cex.names = .56,      # Tamaño de letra en el eje x
        cex.axis = .7,       # Tamaño de letra en los ejes
        cex.main = 1.2,       # Tamaño de letra en el título principal
        cex.lab = 0.9         # Tamaño de letra en las etiquetas de los ejes
)

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio.16[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 34 veces."
## [1] "La palabra discriminación aparece 50 veces."
## [1] "La palabra belleza aparece 12 veces."
## [1] "La palabra mujeres aparece 36 veces."

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.15 <- unlist(sapply(corpus_limpio.15 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.16<- data.frame(text = corpus_vector_2.16) %>%
  unnest_tokens(word, text) %>%
   inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 16 of `x` matches multiple rows in `y`.
## ℹ Row 1588 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.16)

##       sentiment  n
## 1         anger 11
## 2  anticipation  5
## 3       disgust 13
## 4          fear 11
## 5           joy  1
## 6      negative 38
## 7      positive 59
## 8       sadness  9
## 9      surprise  1
## 10        trust 45

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <-get_sentiments("nrc")

ggplot(youtube.sentimientos.16, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2016",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

library(tm)
library(topicmodels)
library(dplyr)
library(ggplot2)
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2016/2016_noticia_61.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm<- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda<- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico y seleccionar solo los primeros 10 términos
terms <- tidy(lda, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(6, wt = beta)

Ánalisis de toda la carpeta

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}
# Analizar el sentimiento
sentiment.16 <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus.as.16 <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm.16 <- DocumentTermMatrix(corpus.as.16)
# Generar un modelo de topic models
lda.16 <- LDA(dtm.16, k = 3)

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.16 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2016"

# Listar los archivos de texto en la carpeta
archivos_texto.16 <- list.files(path = carpeta_textos.16, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("pues","persona","grupo","social","grupos")

# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(15)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2386 of `x` matches multiple rows in `y`.
## ℹ Row 2 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2016",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis del 2017

Palabras frecuentes por año (Wordclouds)

Carga de archivos

# Ruta de la carpeta donde están los archivos seleccionados
carpeta_destino <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2017"

# Listar todos los archivos en la carpeta de destino
archivos_en_carpeta <- list.files(path = carpeta_destino)

Limpieza de archivos

# Crear una función para limpiar el texto
limpiar_texto <- function(archivo) {
  # Leer el contenido del archivo
  contenido <- readLines(file.path(carpeta_destino, archivo), warn = FALSE)
  contenido <- paste(contenido, collapse = " ")
  
  # Eliminar URLs
  contenido <- gsub("http\\S+|www\\.\\S+", "", contenido)
  
  # Eliminar etiquetas HTML
  contenido <- gsub("<.*?>", "", contenido)
  
  # Eliminar puntuación
  contenido <- gsub("[[:punct:]]", "", contenido)
  
  # Eliminar números
  contenido <- gsub("\\d+", "", contenido)
  
  # Eliminar espacios extra
  contenido <- gsub("\\s+", " ", contenido)
  
  # Convertir a minúsculas
  contenido <- tolower(contenido)
  
  return(contenido)
}

# Aplicar la función de limpieza a cada archivo en la carpeta
textos_limpios <- lapply(archivos_en_carpeta, limpiar_texto)

Unimos en un mismo vector la función anterior

# Crear un vector llamado "Texto Limpio"
textos_limpios <- lapply(archivos_en_carpeta, limpiar_texto)

# Combinar los textos limpios en un solo vector
TextoLimpio <- unlist(textos_limpios)

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

library(tm)

# Crear un Corpus con los textos 
corpus <- Corpus(VectorSource(TextoLimpio))

#Documetn Term Matrix
dtm <- DocumentTermMatrix(corpus)

#Ver la Matriz
inspect(dtm)

## <<DocumentTermMatrix (documents: 16, terms: 13454)>>
## Non-/sparse entries: 21155/194109
## Sparsity           : 90%
## Maximal term length: 42
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs como con las los más para pero  por  que una
##   1     5  15  18  13  10    8    0   10   27  17
##   10    2  14  33  13   7    6    2   15   72  15
##   13   14  15  28  25   7    9   10   14   45  19
##   15   10  29  11  22  10   18    2    7   77  11
##   16    3  18  21  14   5   12    0   12   28  18
##   2   673 938 460 726 444  524  925 1550 4080 722
##   3     7  18  12  10   6   11   12   18  115  25
##   4   158 184 155 110 105  104  170  272  860 220
##   5    32  40  20  34  12   30   40   52  254  41
##   9    10  20  15  22  14   15    7    6   61   8

Se eliminan stopwords y palabras específicas que no funcionan al análisis

# Lista de palabras específicas no deseadas
palabras_no_deseadas.17 <- c("años","sé","aún","gracias","dianina","cómo","cada","bueno","italia","sólo","saludos","tal","ahí","hola","creo","vídeo","pues","ver","vez","dos","aquí","caso","dios","etc","hace","misma","veo","todas","día","año","voy","dice","así","ello","sino","casi","video","entiendo","iba","dia","toda","igual","decir","aun","canal","tan","asillevo","hago","super","siempre","ahora","hoy","cosa","entonces","videos","alguien","pasa","veces","ole","mas","gente","cosas","muchas","verdad","mismo", "puede")

# Eliminar stopwords de nuestro corpus
corpus_limpio <- tm_map(corpus, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas
corpus_limpio.17 <- tm_map(corpus_limpio, removeWords, palabras_no_deseadas.17)

## Warning in tm_map.SimpleCorpus(corpus_limpio, removeWords,
## palabras_no_deseadas.17): transformation drops documents

# Crear el nuevo DocumentTermMatrix
dtm_limpio <- DocumentTermMatrix(corpus_limpio)

# Sumar las columnas para obtener el conteo total de cada término
conteo_total_limpio <- colSums(as.matrix(dtm_limpio))

# Ordenar y mostrar los términos más comunes
terminos_comunes_limpio <- sort(conteo_total_limpio, decreasing = TRUE)

# Obtener los 15 términos más comunes
top_15_terminos <- head(terminos_comunes_limpio, 15)

# Crear un data frame con los términos y sus conteos
data <- data.frame(Término = names(top_15_terminos), Conteo = top_15_terminos)

# Crear la gráfica de barras sin líneas de fondo
ggplot(data, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2017", x = NULL, y = "Frecuencia") +
  theme_minimal() + # Establecer un tema minimalista
  theme(panel.grid.major = element_blank(), # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(), # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7)) # Ajustar la orientación del texto del eje x, centrar el título en negritas y ajustar la altura del título

# Tomar los 15 términos más comunes
terminos_top<- names(terminos_comunes_limpio)[1:15]

# Crear un vector con las frecuencias de estos términos
frecuencias_top <- terminos_comunes_limpio[1:15]


# Invertir el orden de los vectores
terminos_top <- rev(terminos_top)
frecuencias_top<- rev(frecuencias_top)

# Crear un gráfico de barras con colores y ajustes de tamaño de letra
barplot(frecuencias_top, 
        names.arg = terminos_top, 
        horiz = TRUE, 
        las = 1, 
        main = "Los 15 términos más comunes en el discurso del 2017", 
        xlab = "Frecuencia", 
        col = colores,  # Agregar los colores personalizados
        border = NA,     # Color de los bordes de las barras
        cex.names = .56,      # Tamaño de letra en el eje x
        cex.axis = .7,       # Tamaño de letra en los ejes
        cex.main = 1.2,       # Tamaño de letra en el título principal
        cex.lab = 0.9         # Tamaño de letra en las etiquetas de los ejes
)

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 24 veces."
## [1] "La palabra discriminación aparece 39 veces."
## [1] "La palabra belleza aparece 46 veces."
## [1] "La palabra mujeres aparece 72 veces."

Se crea la visualización de datos

library(wordcloud)
# Crear la nube de palabras
# se van a plotear los términos comunes (base) que aparecen mínimo de 20 veces y máximo 200. 
wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, min.freq = 10, max.words = 150, random.order = FALSE, rot.per = .3, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## problemas could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## sobrepeso could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## identificada could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## madre could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## niña could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## experiencia could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## comentarios could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## videos could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## encanta could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## momento could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## canal could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## entiendo could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## compartir could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## chica could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## digo could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## dijo could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## grande could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## saludos could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## mujeres could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## parte could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## comía could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## vídeos could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## niños could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## dice could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## muchísimo could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## tema could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## incluso could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## sabes could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## entonces could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## trabajo could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## dejar could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## poder could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## quería could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## super could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## bajar could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## palabras could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## importante could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio), terminos_comunes_limpio, :
## sociedad could not be fit on page. It will not be plotted.

Es importante observar que en este año Diana fue muy mencionada, ella es una activista body positive que tiene un canal de youtube llamado “DIANINA XL” en la que habla sobre la temática.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.17 <- unlist(sapply(corpus_limpio.17 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.17 <- data.frame(text = corpus_vector_2.17) %>%
  unnest_tokens(word, text) %>%
     inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 824 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.17)

##       sentiment   n
## 1         anger 214
## 2  anticipation  51
## 3       disgust 334
## 4          fear 341
## 5           joy  97
## 6      negative 630
## 7      positive 304
## 8       sadness 260
## 9      surprise  21
## 10        trust 146

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.17, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2017",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2017/2017_noticia_51.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.17 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2017"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.17, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("quiere", "aprender", "comida'")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(14)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4672 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2017",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2018

Palabras frecuentes por año (Wordclouds)

Carga de archivos:Muchos de los vídeos de este año que están en la red son emitidos por Brasil

# Ruta de la carpeta donde están los archivos seleccionados
carpeta_destino.18 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2018"

# Listar todos los archivos en la carpeta de destino
archivos_en_carpeta.18 <- list.files(path = carpeta_destino.18)

Limpieza de archivos

# Crear una función para limpiar el texto
limpiar_texto.18 <- function(archivo.18) {
  # Leer el contenido del archivo
  contenido.18 <- readLines(file.path(carpeta_destino.18, archivo.18), warn = FALSE)
  contenido.18 <- paste(contenido.18, collapse = " ")
  
  # Eliminar URLs
  contenido.18 <- gsub("http\\S+|www\\.\\S+", "", contenido.18)
  
  # Eliminar etiquetas HTML
  contenido.18 <- gsub("<.*?>", "", contenido.18)
  
  # Eliminar puntuación
  contenido.18 <- gsub("[[:punct:]]", "", contenido.18)
  
  # Eliminar números
  contenido.18 <- gsub("\\d+", "", contenido.18)
  
  # Eliminar espacios extra
  contenido.18 <- gsub("\\s+", " ", contenido.18)
  
  # Convertir a minúsculas
  contenido.18 <- tolower(contenido.18)
  
  return(contenido.18)
}

# Aplicar la función de limpieza a cada archivo en la carpeta
textos_limpios.18 <- lapply(archivos_en_carpeta.18, limpiar_texto.18)

Unimos en un mismo vector la función anterior

# Crear un vector llamado "Texto Limpio"
textos_limpios.18 <- lapply(archivos_en_carpeta.18, limpiar_texto.18)

# Combinar los textos limpios en un solo vector
TextoLimpio.18 <- unlist(textos_limpios.18)

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

library(tm)

# Crear un Corpus con los textos 
corpus.18 <- Corpus(VectorSource(TextoLimpio.18))

#Documetn Term Matrix
dtm.18 <- DocumentTermMatrix(corpus.18)

Se eliminan stopwords y palabras específicas que no funcionan al análisis

# Lista de palabras específicas no deseadas
palabras_no_deseadas.18 <- c("no","gracias","siempre","así","maquis","hace","personas","tan","aquí","alguien","aún","dos","gente","creo","entonces","mas","bien","cosas","ahora","años","así","etc","tal","sino","vez","mira","ahí","día","cómo","pues","dijo","así","veo","entiendo","clarín","lozano","hacia","llega")

# Eliminar stopwords de nuestro corpus
corpus_limpio.18 <- tm_map(corpus.18, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus.18, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas
corpus_limpio.18<- tm_map(corpus_limpio.18, removeWords, palabras_no_deseadas.18)

## Warning in tm_map.SimpleCorpus(corpus_limpio.18, removeWords,
## palabras_no_deseadas.18): transformation drops documents

# Crear el nuevo DocumentTermMatrix
dtm_limpio.18 <- DocumentTermMatrix(corpus_limpio.18)

# Sumar las columnas para obtener el conteo total de cada término
conteo_total_limpio.18 <- colSums(as.matrix(dtm_limpio.18))

# Ordenar y mostrar los términos más comunes
terminos_comunes_limpio.18 <- sort(conteo_total_limpio.18, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes
top_15_terminos.18 <- head(terminos_comunes_limpio.18, 15)

# Crear un data frame con los términos y sus conteos
data.18 <- data.frame(Término = names(top_15_terminos.18), Conteo = top_15_terminos.18)

# Obtener los 15 términos más comunes
top_15_terminos.18 <- head(terminos_comunes_limpio.18, 15)

# Crear un data frame con los términos y sus conteos
data.18 <- data.frame(Término = names(top_15_terminos.18), Conteo = top_15_terminos.18)

# Crear la gráfica de barras sin líneas de fondo
ggplot(data.18, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2018", x = NULL, y = "Frecuencia") +
  theme_minimal() + # Establecer un tema minimalista
  theme(panel.grid.major = element_blank(), # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(), # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7)) # Ajustar la orientación del texto del eje x, centrar el título en negritas y ajustar la altura del título

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio.18[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 56 veces."
## [1] "La palabra discriminación aparece 58 veces."
## [1] "La palabra belleza aparece 92 veces."
## [1] "La palabra mujeres aparece 603 veces."

Se crea la visualización de datos

library(wordcloud)
# Crear la nube de palabras
# se van a plotear los términos comunes (base) que aparecen mínimo de 20 veces y máximo 200. 
wordcloud(names(terminos_comunes_limpio.18), terminos_comunes_limpio.18, min.freq =20, max.words = 200, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : sexualidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : ejemplo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : contenidos could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : hablando could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : mayoría could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : puedo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : vamos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : feminidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : modelos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : casos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : embargo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : pasa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : demás could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : física could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : historia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : representaciones could not be fit on page. It
## will not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : saludable could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : autoestima could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : haciendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : feminista could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : genial could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : gordita could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : amigos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : mayor could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : pantalla could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : chicos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : comunicación could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : conmigo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.18),
## terminos_comunes_limpio.18, : gustan could not be fit on page. It will not be
## plotted.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.18 <- unlist(sapply(corpus_limpio.18 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.18 <- data.frame(text = corpus_vector_2.18) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 2162 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.18)

##       sentiment   n
## 1         anger 107
## 2  anticipation  54
## 3       disgust 104
## 4          fear 139
## 5           joy 106
## 6      negative 555
## 7      positive 533
## 8       sadness 309
## 9      surprise  31
## 10        trust 269

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.18, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2018",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("//Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2018/2018_noticia_40.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2018"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("entiendo","clarín","lozano","hacia","llega")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(12)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 974 of `x` matches multiple rows in `y`.
## ℹ Row 5 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2018",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2019

Palabras frecuentes por año (Wordclouds)

# Ruta de la carpeta donde están los archivos seleccionados
carpeta_destino.19 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2019"

# Listar todos los archivos en la carpeta de destino
archivos_en_carpeta.19 <- list.files(path = carpeta_destino.19)

# Crear una función para limpiar el texto
limpiar_texto.19 <- function(archivo.19) {
  # Leer el contenido del archivo
  contenido.19 <- readLines(file.path(carpeta_destino.19, archivo.19), warn = FALSE)
  contenido.19 <- paste(contenido.19, collapse = " ")
  
  # Eliminar URLs
  contenido.19 <- gsub("http\\S+|www\\.\\S+", "", contenido.19)
  
  # Eliminar etiquetas HTML
  contenido.19 <- gsub("<.*?>", "", contenido.19)
  
  # Eliminar puntuación
  contenido.19 <- gsub("[[:punct:]]", "", contenido.19)
  
  # Eliminar números
  contenido.19 <- gsub("\\d+", "", contenido.19)
  
  # Eliminar espacios extra
  contenido.19 <- gsub("\\s+", " ", contenido.19)
  
  # Convertir a minúsculas
  contenido.19 <- tolower(contenido.19)
  
  return(contenido.19)
}

# Aplicar la función de limpieza a cada archivo en la carpeta
textos_limpios.19 <- lapply(archivos_en_carpeta.19, limpiar_texto.19)

# Crear un vector llamado "Texto Limpio"
textos_limpios.19 <- lapply(archivos_en_carpeta.19, limpiar_texto.19)

# Combinar los textos limpios en un solo vector
TextoLimpio.19 <- unlist(textos_limpios.19)

library(tm)

# Crear un Corpus con los textos 
corpus.19 <- Corpus(VectorSource(TextoLimpio.19))

#Documetn Term Matrix
dtm.19 <- DocumentTermMatrix(corpus.19)

# Lista de palabras específicas no deseadas
palabras_no_deseadas.19 <- c("ahora","así","mimi", "maher","corden",'"',"bill")

# Eliminar stopwords de nuestro corpus
corpus_limpio.19 <- tm_map(corpus.19, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus.19, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas
corpus_limpio.19<- tm_map(corpus_limpio.19, removeWords, palabras_no_deseadas.19)

## Warning in tm_map.SimpleCorpus(corpus_limpio.19, removeWords,
## palabras_no_deseadas.19): transformation drops documents

# Crear el nuevo DocumentTermMatrix
dtm_limpio.19 <- DocumentTermMatrix(corpus_limpio.19)

# Sumar las columnas para obtener el conteo total de cada término
conteo_total_limpio.19 <- colSums(as.matrix(dtm_limpio.19))

# Ordenar y mostrar los términos más comunes
terminos_comunes_limpio.19 <- sort(conteo_total_limpio.19, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes
top_15_terminos.19 <- head(terminos_comunes_limpio.19, 15)

# Crear un data frame con los términos y sus conteos
data.19 <- data.frame(Término = names(top_15_terminos.19), Conteo = top_15_terminos.19)

# Crear la gráfica de barras sin líneas de fondo
ggplot(data.19, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2019", x = NULL, y = "Frecuencia") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7))

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio.19[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 32 veces."
## [1] "La palabra discriminación aparece 16 veces."
## [1] "La palabra belleza aparece 18 veces."
## [1] "La palabra mujeres aparece 46 veces."

Se crea la visualización de datos

library(wordcloud)
# Crear la nube de palabras
# se van a plotear los términos comunes (base) que aparecen mínimo de 3 veces y máximo 200. 
wordcloud(names(terminos_comunes_limpio.19), terminos_comunes_limpio.19, min.freq =2, max.words = 200, random.order = FALSE, rot.per = 0.3, colors = brewer.pal(6, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : mujeres could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : hecho could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : obesidad could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : videos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : frase could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : saludos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : baja could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : parte could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : hermosa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : puedo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : ejemplo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : mujer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : punto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : quot could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : pasado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : comiendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : amiga could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : delgadas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : luego could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : momento could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : peor could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : ejercicio could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : gordofobia could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : hablar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : haciendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : hombres could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : tema could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : comentario could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : grande could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : vídeos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : algún could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : forma could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : odio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : sigue could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : parece could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : cualquier could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : físico could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : trabajo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : después could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : haber could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : mira could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : acuerdo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : dejar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : problemas could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : puedes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : dieta could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : amor could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : llevo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : normal could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : buena could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : demasiado could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : digan could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : hacen could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : mucha could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : propia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : súper could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : frases could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : saber could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : alimentación could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : amigos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : falta could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : pequeña could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : derecho could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : gym could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : amigas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio.19),
## terminos_comunes_limpio.19, : semana could not be fit on page. It will not be
## plotted.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.19 <- unlist(sapply(corpus_limpio.19 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.19 <- data.frame(text = corpus_vector_2.19) %>%
  unnest_tokens(word, text) %>%
 inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 13 of `x` matches multiple rows in `y`.
## ℹ Row 237 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.19)

##       sentiment   n
## 1         anger  84
## 2  anticipation  27
## 3       disgust 119
## 4          fear 119
## 5           joy  40
## 6      negative 241
## 7      positive 143
## 8       sadness  99
## 9      surprise   4
## 10        trust  90

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos.19 <- get_sentiments("nrc")

ggplot(youtube.sentimientos.19, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2019",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2019/2019_noticia_38.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2019"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("maher","corden","bill", '“',"james")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(12)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1037 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2019",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2020

Palabras frecuentes por año (Wordclouds)

# Ruta de la carpeta donde están los archivos seleccionados para el año 2020
carpeta_destino_2020 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2020"

# Listar todos los archivos en la carpeta del año 2020
archivos_en_carpeta_2020 <- list.files(path = carpeta_destino_2020)

# Crear una función para limpiar el texto para el año 2020
limpiar_texto_2020 <- function(archivo_2020) {
  contenido_2020 <- readLines(file.path(carpeta_destino_2020, archivo_2020), warn = FALSE)
  contenido_2020 <- paste(contenido_2020, collapse = " ")
  contenido_2020 <- gsub("http\\S+|www\\.\\S+", "", contenido_2020)
  contenido_2020 <- gsub("<.*?>", "", contenido_2020)
  contenido_2020 <- gsub("[[:punct:]]", "", contenido_2020)
  contenido_2020 <- gsub("\\d+", "", contenido_2020)
  contenido_2020 <- gsub("\\s+", " ", contenido_2020)
  contenido_2020 <- tolower(contenido_2020)
  return(contenido_2020)
}

# Aplicar la función de limpieza a cada archivo en la carpeta del año 2020
textos_limpios_2020 <- lapply(archivos_en_carpeta_2020, limpiar_texto_2020)

# Crear un vector llamado "Texto Limpio" para el año 2020
TextoLimpio_2020 <- unlist(textos_limpios_2020)

# Crear un Corpus con los textos del año 2020
corpus_2020 <- Corpus(VectorSource(TextoLimpio_2020))

# Crear el DocumentTermMatrix para el año 2020
dtm_2020 <- DocumentTermMatrix(corpus_2020)

# Lista de palabras específicas no deseadas para el año 2020
palabras_no_deseadas_2020 <- c("ser","feo","así","ahora","solo","bien","mal","hace","mas","tan","años","gente","uwu","etc","dia","vas","aún","así","ello","veo", "mejor","gusta","cosas","hacer","persona","verdad","vez","muchas","mismo","ver","creo","bueno","gracias","nunca","alguien","amo","misma")

# Eliminar stopwords de nuestro corpus del año 2020
corpus_limpio_2020 <- tm_map(corpus_2020, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus_2020, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas para el año 2020
corpus_limpio_2020 <- tm_map(corpus_limpio_2020, removeWords, palabras_no_deseadas_2020)

## Warning in tm_map.SimpleCorpus(corpus_limpio_2020, removeWords,
## palabras_no_deseadas_2020): transformation drops documents

# Crear el nuevo DocumentTermMatrix para el año 2020
dtm_limpio_2020 <- DocumentTermMatrix(corpus_limpio_2020)

# Sumar las columnas para obtener el conteo total de cada término para el año 2020
conteo_total_limpio_2020 <- colSums(as.matrix(dtm_limpio_2020))

# Ordenar y mostrar los términos más comunes para el año 2020
terminos_comunes_limpio_2020 <- sort(conteo_total_limpio_2020, decreasing = TRUE)

# Obtener los 15 términos más comunes para el año 2020
top_15_terminos_2020 <- head(terminos_comunes_limpio_2020, 15)

# Crear un data frame con los términos y sus conteos para el año 2020
data_2020 <- data.frame(Término = names(top_15_terminos_2020), Conteo = top_15_terminos_2020)

# Crear la gráfica de barras sin líneas de fondo para el año 2020
ggplot(data_2020, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2020", x = NULL, y = "Frecuencia") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7))

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio_2020[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 23 veces."
## [1] "La palabra discriminación aparece 26 veces."
## [1] "La palabra belleza aparece 218 veces."
## [1] "La palabra mujeres aparece 170 veces."

Se crea la visualización de datos

library(wordcloud)
# Crear la nube de palabras
wordcloud(names(terminos_comunes_limpio_2020), terminos_comunes_limpio_2020, min.freq =10, max.words = 350, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : inseguridad could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : insegura could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : amigas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mucha could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : espejo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : super could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : propio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cambiar could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mundo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : seguridad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : caso could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hacen could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : secundaria could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sentía could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : dientes could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : feliz could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : decían could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : grande could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : momento could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : demasiado could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mujer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : luego could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pensar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sentí could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : familia could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : importa could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : salud could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : problemas could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : tambien could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : baja could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : digo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : decía could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : colegio could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : peor could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : problema could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : niña could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : grandes could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : tema could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : espero could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cuerpos could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : dios could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : saludos could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hizo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : buena could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : después could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : punto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : miedo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : canción could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : bastante could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pasado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : paso could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : parece could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : tenia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : tampoco could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : además could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hacia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : gran could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : identificada could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cambio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : poder could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : historia could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : dicho could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : encantó could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : quería could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : primera could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cosa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sociedad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : ayuda could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : realidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : gordo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : importante could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : dejar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : primaria could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : movimiento could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : positivo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : contigo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : llama could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cierto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hola could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : lindo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : súper could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : casa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hago could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pienso could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : empecé could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sólo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : seguir could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pequeña could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : escuela could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : gustaba could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : positive could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : aquí could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : feos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : bonito could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : delgada could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : quiere could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : gustan could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : lado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : jajaja could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : quot could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sentirme could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : algún could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sigue could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : confianza could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : ejemplo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : simplemente could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : llorar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : apariencia could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : ustedes could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : piernas could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mejorar could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : entiendo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sabes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : consejos could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : palabras could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : comer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : aveces could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : cabeza could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : chicos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : visto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hermana could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : muchísimo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : verme could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : van could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : primer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : siente could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : contenido could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : encanto could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : jaja could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : bajo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : trabajo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : haces could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : recuerdo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : compañeros could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : estilo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : obesidad could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : encantan could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : enserio could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : viendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : amarme could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : necesitaba could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : puedes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : acuerdo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : alguna could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : comentario could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : ganas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : gusto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : única could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : edad could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : inicio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mientras could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : normal could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : razón could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hicieron could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pasó could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : youtube could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : difícil could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mente could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hablando could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : saber could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : demas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sonrisa could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : haciendo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : largo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : sociales could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : justo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : quisiera could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : clase could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : segura could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : unas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : vivir could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : redes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : amigo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : contrario could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : querer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pesar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : dices could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : digan could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hacía could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : maquillaje could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : padres could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : puse could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : uñas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : feas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hombros could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : física could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hombre could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : genial could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : novio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pueden could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : principio could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : chico could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : empezó could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : escuchar could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : nombre could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : admiro could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : actualmente could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : claro could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : primero could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : intro could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pasar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : real could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : aspecto could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : según could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : bajar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mayor could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : quieren could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : serio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : trato could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : culpa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : flaca could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : montón could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pena could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : grupo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hermoso could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mismas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : pensé could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : guapa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : leer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : hablo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : mala could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : trabajar could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : comencé could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : vergüenza could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : opinión could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : diciendo could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_2020),
## terminos_comunes_limpio_2020, : siquiera could not be fit on page. It will not
## be plotted.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.20 <- unlist(sapply(corpus_limpio_2020 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.20 <- data.frame(text = corpus_vector_2.20) %>%
  unnest_tokens(word, text) %>%
   inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>% #necesitas cambiarlo al idioma necesario
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 937 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.20)

##       sentiment    n
## 1         anger  474
## 2  anticipation   58
## 3       disgust  897
## 4          fear  784
## 5           joy  105
## 6      negative 1156
## 7      positive  469
## 8       sadness  143
## 9      surprise   28
## 10        trust  209

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.20, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2020",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2020/2020_noticia_27.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2020"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("—","ana","morales","ojea","tatiana","radical")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(12)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 236 of `x` matches multiple rows in `y`.
## ℹ Row 12 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2020",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2021

Palabras frecuentes por año (Wordclouds)

# Ruta de la carpeta donde están los archivos seleccionados para el año 2021
carpeta_destino_21 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2021"

# Listar todos los archivos en la carpeta del año 2021
archivos_en_carpeta_21 <- list.files(path = carpeta_destino_21)

# Crear una función para limpiar el texto para el año 2021
limpiar_texto_21 <- function(archivo_21) {
  contenido_21 <- readLines(file.path(carpeta_destino_21, archivo_21), warn = FALSE)
  contenido_21 <- paste(contenido_21, collapse = " ")
  contenido_21 <- gsub("http\\S+|www\\.\\S+", "", contenido_21)
  contenido_21 <- gsub("<.*?>", "", contenido_21)
  contenido_21 <- gsub("[[:punct:]]", "", contenido_21)
  contenido_21 <- gsub("\\d+", "", contenido_21)
  contenido_21 <- gsub("\\s+", " ", contenido_21)
  contenido_21 <- tolower(contenido_21)
  return(contenido_21)
}

# Aplicar la función de limpieza a cada archivo en la carpeta del año 2021
textos_limpios_21 <- lapply(archivos_en_carpeta_21, limpiar_texto_21)

# Crear un vector llamado "Texto Limpio" para el año 2021
TextoLimpio_21 <- unlist(textos_limpios_21)

library(tm)

# Crear un Corpus con los textos del año 2021
corpus_21 <- Corpus(VectorSource(TextoLimpio_21))

# Crear el DocumentTermMatrix para el año 2021
dtm_21 <- DocumentTermMatrix(corpus_21)

# Lista de palabras específicas no deseadas para el año 2021
palabras_no_deseadas_21 <- c("solo","bien","ver","creo","ahora","ser","así","gracias","decir","tener","años","hola","dije","final")

# Eliminar stopwords de nuestro corpus del año 2021
corpus_limpio_21 <- tm_map(corpus_21, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus_21, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas para el año 2021
corpus_limpio_21 <- tm_map(corpus_limpio_21, removeWords, palabras_no_deseadas_21)

## Warning in tm_map.SimpleCorpus(corpus_limpio_21, removeWords,
## palabras_no_deseadas_21): transformation drops documents

# Crear el nuevo DocumentTermMatrix para el año 2021
dtm_limpio_21 <- DocumentTermMatrix(corpus_limpio_21)

# Sumar las columnas para obtener el conteo total de cada término para el año 2021
conteo_total_limpio_21 <- colSums(as.matrix(dtm_limpio_21))

# Ordenar y mostrar los términos más comunes para el año 2021
terminos_comunes_limpio_21 <- sort(conteo_total_limpio_21, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes para el año 2021
top_15_terminos_21 <- head(terminos_comunes_limpio_21, 15)

# Crear un data frame con los términos y sus conteos para el año 2021
data_21 <- data.frame(Término = names(top_15_terminos_21), Conteo = top_15_terminos_21)

# Crear la gráfica de barras sin líneas de fondo para el año 2021
ggplot(data_21, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2021", x = NULL, y = "Frecuencia") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7))

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio_21[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 62 veces."
## [1] "La palabra discriminación aparece 51 veces."
## [1] "La palabra belleza aparece 515 veces."
## [1] "La palabra mujeres aparece 370 veces."

Wordcloud

library(wordcloud)
# Crear la nube de palabras
wordcloud(names(terminos_comunes_limpio_21), terminos_comunes_limpio_21, min.freq =23, max.words = 200, random.order = FALSE, rot.per = 0.3, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : excelente could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : movimiento could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : delgado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : poder could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : debería could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : pensar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : dentro could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : película could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : quieren could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : dices could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : buena could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : temas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : bodypositive could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : bonita could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : aceptar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : cualquier could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : seguir could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : hombre could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : comida could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : dietas could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : mensaje could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : simplemente could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : querer could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : vamos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : super could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : maquis could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : llegar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : estereotipos could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : grande could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : anuncio could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : normal could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : existe could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_21),
## terminos_comunes_limpio_21, : diciendo could not be fit on page. It will not be
## plotted.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.21 <- unlist(sapply(corpus_limpio_21 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.21 <- data.frame(text = corpus_vector_2.21) %>%
  unnest_tokens(word, text) %>%
   inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 680 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.21)

##       sentiment   n
## 1         anger 289
## 2  anticipation 102
## 3       disgust 465
## 4          fear 444
## 5           joy 132
## 6      negative 932
## 7      positive 707
## 8       sadness 336
## 9      surprise  43
## 10        trust 387

print (youtube.sentimientos.21)

##       sentiment   n
## 1         anger 289
## 2  anticipation 102
## 3       disgust 465
## 4          fear 444
## 5           joy 132
## 6      negative 932
## 7      positive 707
## 8       sadness 336
## 9      surprise  43
## 10        trust 387

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.21, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2021",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2021/2021_noticia_18.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2021"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("zelma", '“',"azteca","survivor")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(12)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 178 of `x` matches multiple rows in `y`.
## ℹ Row 7 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2021",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2022

Palabras frecuentes por año (Wordclouds)

# Ruta de la carpeta donde están los archivos seleccionados para el año 2022
carpeta_destino_22 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2022"

# Listar todos los archivos en la carpeta del año 2022
archivos_en_carpeta_22 <- list.files(path = carpeta_destino_22)

# Crear una función para limpiar el texto para el año 2022
limpiar_texto_22 <- function(archivo_22) {
  contenido_22 <- readLines(file.path(carpeta_destino_22, archivo_22), warn = FALSE)
  contenido_22 <- paste(contenido_22, collapse = " ")
  contenido_22 <- gsub("http\\S+|www\\.\\S+", "", contenido_22)
  contenido_22 <- gsub("<.*?>", "", contenido_22)
  contenido_22 <- gsub("[[:punct:]]", "", contenido_22)
  contenido_22 <- gsub("\\d+", "", contenido_22)
  contenido_22 <- gsub("\\s+", " ", contenido_22)
  contenido_22 <- tolower(contenido_22)
  return(contenido_22)
}

# Aplicar la función de limpieza a cada archivo en la carpeta del año 2022
textos_limpios_22 <- lapply(archivos_en_carpeta_22, limpiar_texto_22)

# Crear un vector llamado "Texto Limpio" para el año 2022
TextoLimpio_22 <- unlist(textos_limpios_22)

library(tm)

# Crear un Corpus con los textos del año 2022
corpus_22 <- Corpus(VectorSource(TextoLimpio_22))

# Crear el DocumentTermMatrix para el año 2022
dtm_22 <- DocumentTermMatrix(corpus_22)

# Lista de palabras específicas no deseadas para el año 2022
palabras_no_deseadas_22 <- c("bien","gente","solo","personas","así","años","creo","nacional","gracias","hace","siento","video","ver","ahora","cosas","pues","bueno","siempre","persona","decir","entonces","mal","día","verdad","tal","ahí","sino","través","fin","dar","sé","días","dicho","chicahacia","cómo","vez","cabo","veo","nadiediscriminación","tan","dijo","dos","claro")

# Eliminar stopwords de nuestro corpus del año 2022
corpus_limpio_22 <- tm_map(corpus_22, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus_22, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas para el año 2022
corpus_limpio_22 <- tm_map(corpus_limpio_22, removeWords, palabras_no_deseadas_22)

## Warning in tm_map.SimpleCorpus(corpus_limpio_22, removeWords,
## palabras_no_deseadas_22): transformation drops documents

# Crear el nuevo DocumentTermMatrix para el año 2022
dtm_limpio_22 <- DocumentTermMatrix(corpus_limpio_22)

# Sumar las columnas para obtener el conteo total de cada término para el año 2022
conteo_total_limpio_22 <- colSums(as.matrix(dtm_limpio_22))

# Ordenar y mostrar los términos más comunes para el año 2022
terminos_comunes_limpio_22 <- sort(conteo_total_limpio_22, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes para el año 2022
top_15_terminos_22 <- head(terminos_comunes_limpio_22, 15)

# Crear un data frame con los términos y sus conteos para el año 2022
data_22 <- data.frame(Término = names(top_15_terminos_22), Conteo = top_15_terminos_22)

# Crear la gráfica de barras sin líneas de fondo para el año 2022
ggplot(data_22, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2022", x = NULL, y = "Frecuencia") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7))

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio_22[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 79 veces."
## [1] "La palabra discriminación aparece 309 veces."
## [1] "La palabra belleza aparece 464 veces."
## [1] "La palabra mujeres aparece 1635 veces."

Wordcloud

library(wordcloud)
# Crear la nube de palabras
wordcloud(names(terminos_comunes_limpio_22), terminos_comunes_limpio_22, min.freq =40, max.words = 150, random.order = FALSE, rot.per = 0.4, colors = brewer.pal(8, "Dark2"))

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)

corpus_vector_2.22 <- unlist(sapply(corpus_limpio_22 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.22 <- data.frame(text = corpus_vector_2.22) %>%
  unnest_tokens(word, text) %>%
   inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 9 of `x` matches multiple rows in `y`.
## ℹ Row 686 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.22)

##       sentiment    n
## 1         anger  313
## 2  anticipation  256
## 3       disgust  534
## 4          fear  521
## 5           joy  180
## 6      negative 1336
## 7      positive 1438
## 8       sadness  510
## 9      surprise  135
## 10        trust  970

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiments("nrc")

ggplot(youtube.sentimientos.20, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2022",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

#MEJORA: realizar un loop o algoritmo para analizar todos los textos
# HINT: agrupen sus textos en txt NO LOS SOBREESCRIBAN UNO DEL OTRO
# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2022/2022_noticia_7.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)
# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2022"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("inadi","día")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(12)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1709 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2022",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Análisis del 2023

Palabras frecuentes por año (Wordclouds)

# Ruta de la carpeta donde están los archivos seleccionados para el año 2023
carpeta_destino_23 <- "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2023"

# Listar todos los archivos en la carpeta del año 2023
archivos_en_carpeta_23 <- list.files(path = carpeta_destino_23)

# Crear una función para limpiar el texto para el año 2023
limpiar_texto_23 <- function(archivo_23) {
  contenido_23 <- readLines(file.path(carpeta_destino_23, archivo_23), warn = FALSE)
  contenido_23 <- paste(contenido_23, collapse = " ")
  contenido_23 <- gsub("http\\S+|www\\.\\S+", "", contenido_23)
  contenido_23 <- gsub("<.*?>", "", contenido_23)
  contenido_23 <- gsub("[[:punct:]]", "", contenido_23)
  contenido_23 <- gsub("\\d+", "", contenido_23)
  contenido_23 <- gsub("\\s+", " ", contenido_23)
  contenido_23 <- tolower(contenido_23)
  return(contenido_23)
}

# Aplicar la función de limpieza a cada archivo en la carpeta del año 2023
textos_limpios_23 <- lapply(archivos_en_carpeta_23, limpiar_texto_23)

# Crear un vector llamado "Texto Limpio" para el año 2023
TextoLimpio_23 <- unlist(textos_limpios_23)

library(tm)

# Crear un Corpus con los textos del año 2023
corpus_23 <- Corpus(VectorSource(TextoLimpio_23))

# Crear el DocumentTermMatrix para el año 2023
dtm_23 <- DocumentTermMatrix(corpus_23)

# Lista de palabras específicas no deseadas para el año 2023
palabras_no_deseadas_23 <- c("ser","bien","pues","personas","bueno","creo","siempre","ver","así","ahora","hace","decir","años","solo","entonces","cosas","verdad","gordo","mismo","tan","claro","mejor","muchas","persona","mal","día","menos","voy","gracias","gente","hecho","dice","cómo","sé","mas","vale")

# Eliminar stopwords de nuestro corpus del año 2023
corpus_limpio_23 <- tm_map(corpus_23, removeWords, stopwords("es"))

## Warning in tm_map.SimpleCorpus(corpus_23, removeWords, stopwords("es")):
## transformation drops documents

# Eliminar palabras específicas no deseadas para el año 2023
corpus_limpio_23 <- tm_map(corpus_limpio_23, removeWords, palabras_no_deseadas_23)

## Warning in tm_map.SimpleCorpus(corpus_limpio_23, removeWords,
## palabras_no_deseadas_23): transformation drops documents

# Crear el nuevo DocumentTermMatrix para el año 2023
dtm_limpio_23 <- DocumentTermMatrix(corpus_limpio_23)

# Sumar las columnas para obtener el conteo total de cada término para el año 2023
conteo_total_limpio_23 <- colSums(as.matrix(dtm_limpio_23))

# Ordenar y mostrar los términos más comunes para el año 2023
terminos_comunes_limpio_23 <- sort(conteo_total_limpio_23, decreasing = TRUE)

library(ggplot2)

# Obtener los 15 términos más comunes para el año 2023
top_15_terminos_23 <- head(terminos_comunes_limpio_23, 15)

# Crear un data frame con los términos y sus conteos para el año 2023
data_23 <- data.frame(Término = names(top_15_terminos_23), Conteo = top_15_terminos_23)

# Crear la gráfica de barras sin líneas de fondo para el año 2023
ggplot(data_23, aes(x = reorder(Término, -Conteo), y = Conteo)) +
  geom_bar(stat = "identity", fill = colores) +
  labs(title = "Los 15 términos más comunes en el discurso del 2023", x = NULL, y = "Frecuencia") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -7))

Ahora veremos cuantas veces sale por año la palabra central de nuestro análisis

# Conjunto de palabras que deseas contar
conjunto_palabras <- c("gordofobia", "discriminación", "belleza","mujeres")  # Agrega las palabras que desees

# Mostrar el conteo individual de cada palabra en el conjunto
for (palabra in conjunto_palabras) {
  conteo_palabra <- conteo_total_limpio_23[palabra]
  print(paste("La palabra", palabra, "aparece", conteo_palabra, "veces."))
}

## [1] "La palabra gordofobia aparece 855 veces."
## [1] "La palabra discriminación aparece 289 veces."
## [1] "La palabra belleza aparece 323 veces."
## [1] "La palabra mujeres aparece 883 veces."

Wordcloud

library(wordcloud)
# Crear la nube de palabras
wordcloud(names(terminos_comunes_limpio_23), terminos_comunes_limpio_23, min.freq =5, max.words = 150, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(8, "Dark2"))

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : comentarios could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : misma could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : tampoco could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : grande could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : quiere could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : comida could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : puedes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : trabajo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : hacen could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : cualquier could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : pueden could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : videos could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : contenido could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : corporal could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : después could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : sociales could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : belleza could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : médico could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : positive could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : sabes could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : existe could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : cierto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : seguir could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : cambiar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : madre could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : libro could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : pasado could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : sano could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : realidad could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : hacia could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : quieren could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : discriminación could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : entiendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : bastante could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : mental could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : respeto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : dejar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : simplemente could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : grasa could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : hablando could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : diciendo could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : mensaje could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : alimentación could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(names(terminos_comunes_limpio_23),
## terminos_comunes_limpio_23, : hábitos could not be fit on page. It will not be
## plotted.

Análisis de sentimiento

library(ggplot2)
library(dplyr)
library(tidytext)
library(syuzhet)


corpus_vector_2.23 <- unlist(sapply(corpus_limpio_23 , as.character))

#Análisis de sentimiento con el paquete nrc,asigna etiquetas de sentimiento (por ejemplo, positivo, negativo o neutro) a palabras en un texto basándose en un conjunto de datos predefinido que asocia palabras con emociones o sentimientos específicos. Finalmente, contamos los sentimientos utilizando count().


youtube.sentimientos.23 <- data.frame(text = corpus_vector_2.23) %>%
  unnest_tokens(word, text) %>%
    inner_join(get_sentiment_dictionary("nrc", language = "spanish")) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiment_dictionary("nrc", language = "spanish")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 2710 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word, sentiment)`

print (youtube.sentimientos.23)

##       sentiment    n
## 1         anger  728
## 2  anticipation  371
## 3       disgust 1053
## 4          fear 1177
## 5           joy  289
## 6      negative 2804
## 7      positive 1956
## 8       sadness 1168
## 9      surprise  141
## 10        trust 1078

# Obtener los nombres predeterminados de los sentimientos del diccionario NRC
nombres_sentimientos <- get_sentiment_dictionary("nrc", language = "spanish")
ggplot(youtube.sentimientos.23, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribución de Sentimientos en el discurso del 2023",
       x = NULL, y = "Frecuencia") +
  scale_fill_brewer(palette = "Set3") +  # Puedes elegir otra paleta de colores
  scale_x_discrete(labels = nombres_sentimientos) +  # Cambiar etiquetas de sentimientos a español
  theme_minimal() +
  theme(panel.grid.major = element_blank(),  # Eliminar líneas de la cuadrícula mayor
        panel.grid.minor = element_blank(),  # Eliminar líneas de la cuadrícula menor
        axis.text.x = element_text(face = "bold"),  # Etiquetas de eje x en negritas
        plot.title = element_text(hjust = 0.5, face = "bold", vjust = -3)) +  # Ajustar título del gráfico
  guides(fill = FALSE)  # Eliminar la leyenda de colores (fill)

Análisis de tópico

Ánalisis de solo un txt

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(sentimentr)
library(topicmodels)
library(tm)
library(ggplot2)

# Leer el archivo txt
transcription <- read_file("/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2023/2023_39.txt")
# Preprocesar el texto
transcription <- gsub("\n", " ", transcription)
transcription <- tolower(transcription)
transcription <- removePunctuation(transcription)
transcription <- removeWords(transcription, stopwords("spanish"))
# Analizar el sentimiento
sentiment <- sentiment_by(transcription)

## Warning: Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory.  It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.

# Convertir el texto en un Corpus
corpus <- Corpus(VectorSource(transcription))
# Convertir el texto en un DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
# Generar un modelo de topic models
lda <- LDA(dtm, k = 3)
# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, wt = beta)

Ánalisis de toda la carpeta

# Importar las librerías necesarias
library(tidyverse)
library(tidytext)
library(dplyr)
library(topicmodels)
library(tm)
library(ggplot2)

# Crear una función para analizar un archivo de texto
analyze_text <- function(file_path) {
  # Leer el archivo txt
  transcription <- read_file(file_path)
  # Preprocesar el texto
  transcription <- gsub("\n", " ", transcription)
  transcription <- tolower(transcription)
  transcription <- removePunctuation(transcription)
  transcription <- removeWords(transcription, stopwords("spanish"))
  # Verificar que el documento no esté vacío
  if (nchar(transcription) > 0) {
    # Convertir el texto en un Corpus
    corpus <- Corpus(VectorSource(transcription))
    # Convertir el texto en un DocumentTermMatrix
    dtm <- DocumentTermMatrix(corpus)
    # Verificar que el DTM contenga al menos un término
    if (length(dtm$dimnames$Terms) > 0) {
      # Generar un modelo de topic models
      lda <- LDA(dtm, k = 5)
      # Obtener los términos más importantes de cada tópico
      terms <- tidy(lda, matrix = "beta") %>%
        group_by(term) %>%
        summarize(frequency = sum(beta))
      return(terms)
    }
  }
  return(NULL) # Si el documento no contiene datos válidos
}

# Ruta de la carpeta que contiene los archivos de texto
carpeta_textos.18 <-  "/Users/ahtzirigarcia/Desktop/RETO/Vídeos de youtube/2023"

# Listar los archivos de texto en la carpeta
archivos_texto <- list.files(path = carpeta_textos.18, pattern = "\\.txt$|\\.csv$", full.names = TRUE)

# Inicializar una lista para almacenar los resultados
resultados <- list()

# Iterar a través de los archivos y analizar cada uno
for (archivo in archivos_texto) {
  resultado <- analyze_text(archivo)
  if (!is.null(resultado)) {
    resultados[[archivo]] <- resultado
  }
}

# Combinar y filtrar los resultados de términos más frecuentes
resultados_combinados <- bind_rows(resultados)
top_terms <- resultados_combinados %>%
  group_by(term)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda, matrix = "beta")

#Para limpiar palabras del análisis de sentimiento: 

# Lista de términos a eliminar de la visualización
terminos_a_eliminar <- c("    ","personas","así","voy","pues","día","digo")


# Ordenar los términos por beta de mayor a menor y seleccionar los 15 primeros
top_terms <- terms %>%
  filter(!term %in% terminos_a_eliminar) %>%
  arrange(desc(beta)) %>%
  top_n(14)

## Selecting by beta

# Agregar la información del tema a resultados_combinados
resultados_combinados <- resultados_combinados %>%
  right_join(top_terms, by = c("term" = "term"))

## Warning in right_join(., top_terms, by = c(term = "term")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6772 of `x` matches multiple rows in `y`.
## ℹ Row 5 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Visualizar los términos más frecuentes con facet_wrap
ggplot(resultados_combinados, aes(x = reorder(term, beta), y = beta, fill = factor(topic))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Los términos más importantes en cada tópico del 2023",
       x = NULL, y = "Importancia") +
  scale_fill_manual(values = colores) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", vjust = -1)  # Ajustar el título en negritas y posición vertical
  ) +
  guides(fill = FALSE)

Evolución del discurso body positive y gordofobico desde el 2015 al 2023

Ahtziri García

2023-11-09

Se instalan las librerias que se utilizaran durante todo el proceso.

Carga de archivos

Limpieza de archivos

Unimos en un mismo vector la función anterior

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

Se eliminan stopwords y palabras específicas

Se crea la visualización de datos

Carga de archivos

Limpieza de archivos

Unimos en un mismo vector la función anterior

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

Se eliminan stopwords y palabras específicas que no funcionan al análisis

Se crea la visualización de datos

Carga de archivos

Limpieza de archivos

Unimos en un mismo vector la función anterior

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

Se eliminan stopwords y palabras específicas que no funcionan al análisis

Se crea la visualización de datos

Carga de archivos:Muchos de los vídeos de este año que están en la red son emitidos por Brasil

Limpieza de archivos

Unimos en un mismo vector la función anterior

Análisis Exploratorio en el que se crea la Matriz de Términos de Documento e identifica y lista los términos más frecuentes en el conjunto de datos.

Se eliminan stopwords y palabras específicas que no funcionan al análisis

Se crea la visualización de datos

Se crea la visualización de datos

Se crea la visualización de datos