Análisis de sentimientos de documentos publicados por el Instituto Nacional de Pueblos Indígenas

#Cargamos librerías necesarias

library(pdftools) #Permite leer los docuementos en formato pdf

## Warning: package 'pdftools' was built under R version 4.3.2

## Using poppler version 23.08.0

library(tm) #Permite realizar minería de texto

## Warning: package 'tm' was built under R version 4.3.2

## Loading required package: NLP

library(syuzhet)#Librería para hacer el análisis de sentimiento.

## Warning: package 'syuzhet' was built under R version 4.3.2

library(ggplot2) #Permite acer vizualizaciones con la información disponible.

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(topicmodels)#Permite extraer información y categorizar grandes conjuntos de documentos.
library(wordcloud) #Crea un corpus para hacer el análisis de frecuencia de palabras.

## Warning: package 'wordcloud' was built under R version 4.3.2

## Loading required package: RColorBrewer

library(tidytext)#Permite realizar mineo de texto con el uso de herramientas tidy
library(dplyr) #Permite utilizar el operador %>%

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#AÑO 2023

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta <- "C:/Users/DELL/Desktop/Métodos cuantitativos/pdf2023"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf <- list.files(path = carpeta, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto <- function(texto) {
  # Elimina etiquetas HTML
  texto <- gsub("<.*?>", " ", texto)
  
  # Elimina números
  texto <- gsub("\\d+", " ", texto)
  
  # Elimina URL (http/https/www)
  texto <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto <- gsub("\\s+", " ", texto)
  texto <- gsub("[[:punct:]]", " ", texto)
  
  return(texto)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios <- lapply(archivos_pdf, function(archivo) {
  texto <- tolower(pdf_text(archivo))
  textos_limpios <- limpiar_texto(texto)

})

# Instala y carga las bibliotecas necesarias
library(proxy)

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

library(cluster)


# Usa directamente el texto preprocesado
todos_los_textos <- tolower(textos_limpios[[1]])

# Elimina stopwords(palabras que no son relevante en el análisis como los artículos)
todos_los_textos <- removeWords(todos_los_textos, stopwords("spanish"))
palabras_eliminar <- c("coi", "fracci", "nuu", "dos", "representaci", "poblaci", "integraci", "ill", "sabre", "asimismo", "cada", "deberan", "comisi", "distribuci")
todos_los_textos <- gsub(paste(palabras_eliminar, collapse = "|"), "", todos_los_textos, ignore.case = TRUE) 

#Crea un corpus a partir de una colección de textos (todos_los_textos) y luego genera una matriz de términos de documentos (dtm) utilizando DocumentTermMatrix
corpus <- Corpus(VectorSource(todos_los_textos))
dtm <- DocumentTermMatrix(corpus)

#Calcula el conteo total de términos en la matriz de términos de documentos (dtm) y luego ordena estos conteos de manera descendente, almacenándolos en la variable 'terminos_comunes'.
conteo_total <- colSums(as.matrix(dtm))
terminos_comunes <- sort(conteo_total, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia <- data.frame(termino = names(terminos_comunes), conteo = terminos_comunes)
head(Frecuencia, 25)

##                     termino conteo
## pueblos             pueblos    122
## nacional           nacional     56
## consejo             consejo     53
## mexico               mexico     50
## lndfgenas         lndfgenas     49
## nahua                 nahua     45
## indfgenas         indfgenas     42
## pueblo               pueblo     41
## oaxaca               oaxaca     41
## maya                   maya     36
## afromexicano   afromexicano     34
## veracruz           veracruz     25
## gobiernode       gobiernode     24
## derechos           derechos     23
## afromexicanas afromexicanas     23
## comunidades     comunidades     22
## puebla               puebla     22
## gobierno           gobierno     20
## inpi                   inpi     19
## sur                     sur     18
## nahuatl             nahuatl     18
## desarrollo       desarrollo     17
## regiones           regiones     17
## norte                 norte     17
## indigenas         indigenas     16

#Crea una nube de palabras (wordcloud) utilizando la biblioteca correspondiente, donde se visualizan términos (palabras) y sus frecuencias asociadas desde el dataframe 'Frecuencia'. Se establece un filtro mínimo de frecuencia (min.freq = 5), un límite máximo de palabras a mostrar (max.words = 100), se desactiva el orden aleatorio (random.order = FALSE), se ajusta la rotación de las palabras (rot.per = 0.2), y se asigna una paleta de colores mediante 'brewer.pal(10, "Paired")'
library(wordcloud)
wordcloud(words = Frecuencia$termino,
          freq = Frecuencia$conteo,
          min.freq = 8,
          max.words = 100, random.order = FALSE, rot.per = 0.1, colors = brewer.pal(10,"Paired"))

#Análisis de tópicos
# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model <- LDA(dtm, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms <- tidy(lda_model, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico 2023:\n")

## Términos más importantes de cada tópico 2023:

print(terms)

## # A tibble: 26 × 3
## # Groups:   topic [5]
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 consejo   0.0227
##  2     5 consejo   0.0275
##  3     1 derechos  0.0162
##  4     1 indfgenas 0.0162
##  5     2 indfgenas 0.0165
##  6     1 lndfgenas 0.0205
##  7     5 lndfgenas 0.0314
##  8     4 mexico    0.0217
##  9     1 nacional  0.0227
## 10     2 nacional  0.0120
## # ℹ 16 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Tópicos 2023",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table <- tidy(lda_model, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table)

## # A tibble: 7,905 × 2
##    term         beta
##    <chr>       <dbl>
##  1 acuerdo 2.16e-  3
##  2 acuerdo 1.50e-  3
##  3 acuerdo 1.39e-  3
##  4 acuerdo 1.87e-140
##  5 acuerdo 6.87e-  3
##  6 adelfo  1.08e-  3
##  7 adelfo  5.07e-150
##  8 adelfo  2.19e-150
##  9 adelfo  4.57e-153
## 10 adelfo  2.13e-148
## # ℹ 7,895 more rows

#Cargar la biblioteca para análisis de emociones
library(text)

## Warning: package 'text' was built under R version 4.3.2

## [0;34mThis is text (version 1.0).
## [0m[0;32mText is new and still rapidly improving.
##                
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
##                Please send us feedback based on your experience.[0m[0;35m
## 
## Please note that defaults has changed in the textEmbed-functions since last version; see help(textEmbed) or www.r-text.org for more details.[0m

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2023 <- get_nrc_sentiment(todos_los_textos, language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2023 <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2023) <- sentimientos_espanol2023

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2023 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2023)), valor = colSums(puntuaciones_emociones2023))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2023 <- sentimientos_df2023 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2023, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2023", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

#Se repite el proceso para cada año de análisis

##AÑO 2022

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta2022 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/inpi_pdfs/inpi_2022"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf2022 <- list.files(path = carpeta2022, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto2022 <- function(texto2022) {
  # Elimina etiquetas HTML
  texto2022 <- gsub("<.*?>", " ", texto2022)
  
  # Elimina números
  texto2022 <- gsub("\\d+", " ", texto2022)
  
  # Elimina URL (http/https/www)
  texto2022 <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto2022)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto2022 <- gsub("\\s+", " ", texto2022)
  texto2022 <- gsub("[[:punct:]]", " ", texto2022)
  # Elimina todo lo que no sea una palabra
  texto2022 <- gsub("\\b[^a-zA-Z]\\b", " ", texto2022)
  
  return(texto2022)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios2022 <- lapply(archivos_pdf2022, function(archivo2022) {
  texto2022 <- tolower(pdf_text(archivo2022))
  textos_limpios2022 <- limpiar_texto(texto2022)
  
})

# Usa directamente el texto preprocesado
todos_los_textos2022 <- tolower(textos_limpios2022[[1]])

# Elimina stopwords
todos_los_textos2022 <- removeWords(todos_los_textos2022, stopwords("spanish"))
palabras_eliminar2022 <- c("", "puede", "debe", "deberá", "cuales", "pueden", "cualquier", "respecto", "todas", "indre", "través", "dos")
todos_los_textos2022 <- gsub(paste(palabras_eliminar2022, collapse = "|"), "", todos_los_textos2022, ignore.case = TRUE) 


corpus2022 <- Corpus(VectorSource(todos_los_textos2022))
dtm2022 <- DocumentTermMatrix(corpus2022)


conteo_total2022 <- colSums(as.matrix(dtm2022))
terminos_comunes2022 <- sort(conteo_total2022, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia2022 <- data.frame(termino2022 = names(terminos_comunes2022), conteo2022 = terminos_comunes2022)
head(Frecuencia2022, 25)

##                   termino2022 conteo2022
## salud                   salud         24
## viruela               viruela         23
## caso                     caso         23
## casos                   casos         17
## símica                 símica         17
## contacto             contacto         15
## epidemiológica epidemiológica         11
## lesiones             lesiones         11
## pública               pública          9
## días                     días          8
## muestras             muestras          8
## organización     organización          8
## personal             personal          8
## disponible         disponible          7
## países                 países          7
## vigilancia         vigilancia          7
## confirmado         confirmado          7
## contactos           contactos          7
## probable             probable          7
## riesgo                 riesgo          7
## diagnóstico       diagnóstico          6
## enfermedad         enfermedad          6
## julio                   julio          6
## mundial               mundial          6
## oms                       oms          6

wordcloud(words = Frecuencia2022$termino2022,
          freq = Frecuencia2022$conteo2022,
          min.freq = 2,
          max.words = 100, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(10,"Paired"),
          family="Arial")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : jurisdicción could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : polipropileno could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : realizar could not be fit on page. It will not be
## plotted.

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : padecimiento could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : podría could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : prevención could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2022$termino2022, freq =
## Frecuencia2022$conteo2022, : dirección could not be fit on page. It will not be
## plotted.

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model2022 <- LDA(dtm2022, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms2022 <- tidy(lda_model2022, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(terms2022)

## # A tibble: 30 × 3
## # Groups:   topic [5]
##    topic term             beta
##    <int> <chr>           <dbl>
##  1     3 casos          0.0191
##  2     4 casos          0.0161
##  3     5 disponible     0.0455
##  4     3 enfermedad     0.0127
##  5     1 epidemiológica 0.0144
##  6     2 lesiones       0.0193
##  7     2 muestras       0.0270
##  8     5 mundial        0.0364
##  9     5 organización   0.0455
## 10     1 salud          0.0144
## # ℹ 20 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms2022, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Términos en Tópicos 2022",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table2022 <- tidy(lda_model2022, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table2022)

## # A tibble: 3,665 × 2
##    term          beta
##    <chr>        <dbl>
##  1 acumulado 7.25e-89
##  2 acumulado 3.06e-86
##  3 acumulado 2.12e- 3
##  4 acumulado 4.53e-89
##  5 acumulado 7.04e-73
##  6 afecta    2.40e-89
##  7 afecta    1.03e-86
##  8 afecta    2.12e- 3
##  9 afecta    7.00e-89
## 10 afecta    6.44e-74
## # ℹ 3,655 more rows

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2022 <- get_nrc_sentiment(todos_los_textos2022 , language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2022  <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2022) <- sentimientos_espanol2022 

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2022 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2022)), valor = colSums(puntuaciones_emociones2022))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2022 <- sentimientos_df2022 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2022, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2022", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

#AÑO 2021

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta2021 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/inpi_pdfs/inpi_2021"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf2021 <- list.files(path = carpeta2021, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto2021 <- function(texto2021) {
  # Elimina etiquetas HTML
  texto2021 <- gsub("<.*?>", " ", texto2021)
  
  # Elimina números
  texto2021 <- gsub("\\d+", " ", texto2021)
  
  # Elimina URL (http/https/www)
  texto2021 <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto2021)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto2021 <- gsub("\\s+", " ", texto2021)
  texto2021 <- gsub("[[:punct:]]", " ", texto2021)
  
  return(texto2022)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios2021 <- lapply(archivos_pdf2021, function(archivo2021) {
  texto2021 <- tolower(pdf_text(archivo2021))
  textos_limpios2021 <- limpiar_texto(texto2021)
  return(textos_limpios2021)
})

## PDF error: Invalid Font Weight

# Usa directamente el texto preprocesado
todos_los_textos2021 <- tolower(textos_limpios2021[[1]])

# Elimina stopwords
todos_los_textos2021 <- removeWords(todos_los_textos2021, stopwords("spanish"))
palabras_eliminar2021 <- c("for", "hrc", "rev","haut", "puede")
todos_los_textos2021 <- gsub(paste(palabras_eliminar2021, collapse = "|"), "", todos_los_textos2021, ignore.case = TRUE) 


corpus2021 <- Corpus(VectorSource(todos_los_textos2021))
dtm2021 <- DocumentTermMatrix(corpus2021)


conteo_total2021 <- colSums(as.matrix(dtm2021))
terminos_comunes2021 <- sort(conteo_total2021, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia2021 <- data.frame(termino2021 = names(terminos_comunes2021), conteo2021 = terminos_comunes2021)
head(Frecuencia2021, 25)

##               termino2021 conteo2021
## derechos         derechos         20
## humanos           humanos         17
## consejo           consejo         13
## resolución     resolución         10
## miembro           miembro          6
## especial         especial          4
## expertos         expertos          4
## relator           relator          4
## asia                 asia          3
## candidaturas candidaturas          3
## central           central          3
## grupo               grupo          3
## indígenas       indígenas          3
## mecanismo       mecanismo          3
## pacífico         pacífico          3
## pueblos           pueblos          3
## situación       situación          3
## trabajo           trabajo          3
## candidatura   candidatura          2
## empresas         empresas          2
## europa             europa          2
## inmal               inmal          2
## mandatos         mandatos          2
## nombrados       nombrados          2
## ohchr               ohchr          2

wordcloud(words = Frecuencia2021$termino2021,
          freq = Frecuencia2021$conteo2021,
          min.freq = 2,
          max.words = 100, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(10,"Paired"),
          family="Arial")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model2021 <- LDA(dtm2021, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms2021 <- tidy(lda_model2021, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(terms2021)

## # A tibble: 25 × 3
## # Groups:   topic [5]
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 consejo  0.0674
##  2     2 consejo  0.0580
##  3     1 derechos 0.0581
##  4     2 derechos 0.137 
##  5     1 humanos  0.0945
##  6     2 humanos  0.0691
##  7     4 inmal    0.189 
##  8     2 miembro  0.0278
##  9     4 ohchr    0.164 
## 10     5 ohchr    0.0659
## # ℹ 15 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms2021, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Términos en Tópicos 2021",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table2021<- tidy(lda_model2021, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table2021)

## # A tibble: 640 × 2
##    term            beta
##    <chr>          <dbl>
##  1 acepta      5.66e- 3
##  2 acepta      3.96e- 3
##  3 acepta      7.07e-36
##  4 acepta      1.26e- 3
##  5 acepta      3.40e-35
##  6 acompañadas 5.43e- 3
##  7 acompañadas 4.20e- 3
##  8 acompañadas 2.85e-36
##  9 acompañadas 8.25e- 4
## 10 acompañadas 5.72e-36
## # ℹ 630 more rows

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2021 <- get_nrc_sentiment(todos_los_textos2021 , language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2021  <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2021) <- sentimientos_espanol2021 

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2021 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2021)), valor = colSums(puntuaciones_emociones2021))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2021 <- sentimientos_df2021 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2021, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2021", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

#AÑO 2020

##AÑO 2020

library(pdftools)
library(tm)

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta2020 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/inpi_pdfs/inpi_2020"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf2020 <- list.files(path = carpeta2020, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto2020 <- function(texto2020) {
  # Elimina etiquetas HTML
  texto2020 <- gsub("<.*?>", " ", texto2020)
  
  # Elimina números
  texto2020 <- gsub("\\d+", " ", texto2020)
  
  # Elimina URL (http/https/www)
  texto2020 <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto2020)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto2020 <- gsub("\\s+", " ", texto2020)
  texto2020 <- gsub("[[:punct:]]", " ", texto2020)
  
  return(texto2020)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios2020 <- lapply(archivos_pdf2020, function(archivo2020) {
  texto2020 <- tolower(pdf_text(archivo2020))
  textos_limpios2020 <- limpiar_texto(texto2020)
  return(textos_limpios2020)
})

## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight

## PDF error: Invalid 'begincidrange' operator in ToUnicode CMap
## PDF error: Invalid 'begincidrange' operator in ToUnicode CMap
## PDF error: Invalid 'begincidrange' operator in ToUnicode CMap

# Usa directamente el texto preprocesado
todos_los_textos2020 <- tolower(textos_limpios2020[[1]])

# Elimina stopwords
todos_los_textos2020 <- removeWords(todos_los_textos2020, stopwords("spanish"))
palabras_eliminar2020 <- c("mas", "más", "así","for")
todos_los_textos2020 <- gsub(paste(palabras_eliminar2020, collapse = "|"), "", todos_los_textos2020, ignore.case = TRUE) 


corpus2020 <- Corpus(VectorSource(todos_los_textos2020))
dtm2020 <- DocumentTermMatrix(corpus2020)


conteo_total2020 <- colSums(as.matrix(dtm2020))
terminos_comunes2020 <- sort(conteo_total2020, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia2020 <- data.frame(termino2020 = names(terminos_comunes2020), conteo2020 = terminos_comunes2020)
head(Frecuencia2020, 25)

##                 termino2020 conteo2020
## indígenas         indígenas        395
## pueblos             pueblos        286
## derechos           derechos        176
## comunidades     comunidades        157
## afromexicanas afromexicanas        139
## indígena           indígena        130
## desarrollo       desarrollo        103
## base                   base         89
## valor                 valor         83
## nacional           nacional         78
## cultural           cultural         76
## población         población         76
## variable           variable         75
## derecho             derecho         74
## bienestar         bienestar         71
## meta                   meta         68
## afromexicano   afromexicano         65
## línea                 línea         65
## acciones           acciones         62
## inpi                   inpi         59
## integral           integral         56
## consulta           consulta         55
## inmación           inmación         54
## objetivo           objetivo         50
## parámetro         parámetro         47

wordcloud(words = Frecuencia2020$termino2020,
          freq = Frecuencia2020$conteo2020,
          min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.1, colors = brewer.pal(10,"Paired"),
          family="Arial")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model2020 <- LDA(dtm2020, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms2020 <- tidy(lda_model2020, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(terms2020)

## # A tibble: 25 × 3
## # Groups:   topic [5]
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     2 derecho    0.0181
##  2     2 derechos   0.0266
##  3     5 derechos   0.0229
##  4     3 desarrollo 0.0213
##  5     1 indígenas  0.0252
##  6     2 indígenas  0.0371
##  7     3 indígenas  0.0400
##  8     5 indígenas  0.0557
##  9     1 nacional   0.0111
## 10     1 pueblos    0.0210
## # ℹ 15 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms2020, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Términos en Tópicos 2020",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table2020<- tidy(lda_model2020, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table2020)

## # A tibble: 12,450 × 2
##    term           beta
##    <chr>         <dbl>
##  1 acciones  2.25e-  3
##  2 acciones  2.27e-  3
##  3 acciones  1.12e-  2
##  4 acciones  4.48e-  3
##  5 acciones  7.63e-  3
##  6 acrónimos 7.97e-  4
##  7 acrónimos 1.20e- 91
##  8 acrónimos 1.33e-103
##  9 acrónimos 2.34e-107
## 10 acrónimos 2.56e-104
## # ℹ 12,440 more rows

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2020 <- get_nrc_sentiment(todos_los_textos2020, language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2020 <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2020) <- sentimientos_espanol2020

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2020 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2020)), valor = colSums(puntuaciones_emociones2020))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2020 <- sentimientos_df2020 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2020, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2020", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

#AÑO 2019

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta2019 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/inpi_pdfs/inpi_2019"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf2019 <- list.files(path = carpeta2019, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto2019 <- function(texto2019) {
  # Elimina etiquetas HTML
  texto2019 <- gsub("<.*?>", " ", texto2019)
  
  # Elimina números
  texto2019 <- gsub("\\d+", " ", texto2019)
  
  # Elimina URL (http/https/www)
  texto2019 <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto2019)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto2019 <- gsub("\\s+", " ", texto2019)
  texto2019 <- gsub("[[:punct:]]", " ", texto2019)
  
  return(texto2019)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios2019 <- lapply(archivos_pdf2019, function(archivo2019) {
  texto2019 <- tolower(pdf_text(archivo2019))
  textos_limpios2019 <- limpiar_texto(texto2019)
  return(textos_limpios2019)
})

## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight

# Usa directamente el texto preprocesado
todos_los_textos2019 <- tolower(textos_limpios2019[[1]])

# Elimina stopwords
todos_los_textos2019 <- removeWords(todos_los_textos2019, stopwords("spanish"))
palabras_eliminar2019 <- c("for", "hrc", "así", "mas", "más")
todos_los_textos2019 <- gsub(paste(palabras_eliminar2019, collapse = "|"), "", todos_los_textos2019, ignore.case = TRUE) 


corpus2019 <- Corpus(VectorSource(todos_los_textos2019))
dtm2019 <- DocumentTermMatrix(corpus2019)


conteo_total2019 <- colSums(as.matrix(dtm2019))
terminos_comunes2019 <- sort(conteo_total2019, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia2019 <- data.frame(termino2019 = names(terminos_comunes2019), conteo2019 = terminos_comunes2019)
head(Frecuencia2019, 25)

##                     termino2019 conteo2019
## indígenas             indígenas        165
## pueblos                 pueblos        162
## instituto             instituto        130
## general                 general        106
## acciones               acciones         92
## coordinación       coordinación         74
## entidades             entidades         62
## coordinar             coordinar         62
## afromexicano       afromexicano         60
## administración   administración         54
## desarrollo           desarrollo         52
## progra                   progra         52
## pública                 pública         51
## administrativas administrativas         50
## federal                 federal         49
## derechos               derechos         49
## director               director         49
## unidades               unidades         47
## gobierno               gobierno         46
## nacional               nacional         45
## disposiciones     disposiciones         44
## comunidades         comunidades         43
## artículo               artículo         41
## afromexicanas     afromexicanas         39
## junta                     junta         38

wordcloud(words = Frecuencia2019$termino2019,
          freq = Frecuencia2019$conteo2019,
          min.freq = 2,
          max.words = 100, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(10,"Paired"),
          family="Arial")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : municipios could not be fit on page. It will not
## be plotted.

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : promover could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : aplicables could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : transversalidad could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : intercultural could not be fit on page. It will
## not be plotted.

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : procesos could not be fit on page. It will not be
## plotted.

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : estatuto could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : participación could not be fit on page. It will
## not be plotted.

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : colaboración could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : correspondan could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : indígena could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : elaboración could not be fit on page. It will not
## be plotted.

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : coordinadores could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : principios could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : protección could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : celebrar could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = Frecuencia2019$termino2019, freq =
## Frecuencia2019$conteo2019, : necesarios could not be fit on page. It will not
## be plotted.

# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model2019 <- LDA(dtm2019, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms2019 <- tidy(lda_model2019, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(terms2019)

## # A tibble: 26 × 3
## # Groups:   topic [5]
##    topic term              beta
##    <int> <chr>            <dbl>
##  1     4 administrativas 0.0146
##  2     5 administrativas 0.0137
##  3     1 entidades       0.0152
##  4     1 federal         0.0147
##  5     1 gobierno        0.0171
##  6     1 indígenas       0.0242
##  7     2 indígenas       0.0442
##  8     3 indígenas       0.0336
##  9     2 instituto       0.0442
## 10     4 instituto       0.0291
## # ℹ 16 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms2019, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Términos en Tópicos 2019",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table2019<- tidy(lda_model2019, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table2019)

## # A tibble: 6,930 × 2
##    term        beta
##    <chr>      <dbl>
##  1 acción 6.78e-  4
##  2 acción 1.20e-  3
##  3 acción 1.68e- 91
##  4 acción 5.28e- 89
##  5 acción 1.24e-  7
##  6 actual 1.16e- 99
##  7 actual 1.20e-  3
##  8 actual 3.07e-100
##  9 actual 1.24e- 96
## 10 actual 2.66e-100
## # ℹ 6,920 more rows

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2019 <- get_nrc_sentiment(todos_los_textos2019, language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2019 <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2019) <- sentimientos_espanol2019

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2019 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2019)), valor = colSums(puntuaciones_emociones2019))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2019 <- sentimientos_df2019 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2019, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2019", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

#AÑO 2018

# Especifica la ruta de la carpeta que contiene los archivos PDF
carpeta2018 <- "C:/Users/DELL/Desktop/Métodos cuantitativos/inpi_pdfs/inpi_2018"  # Cambia esto con la ruta correcta a tu carpeta

# Lista los archivos PDF en la carpeta
archivos_pdf2018 <- list.files(path = carpeta2018, pattern = "\\.pdf$", full.names = TRUE)

# Función para limpiar el texto
limpiar_texto2018 <- function(texto2018) {
  # Elimina etiquetas HTML
  texto2018 <- gsub("<.*?>", " ", texto2018)
  
  # Elimina números
  texto2018 <- gsub("\\d+", " ", texto2018)
  
  # Elimina URL (http/https/www)
  texto2018 <- gsub("(http|https)://\\S+|www\\.\\S+", " ", texto2018)
  
  # Elimina espacios en blanco adicionales y signos de puntuación
  texto2018 <- gsub("\\s+", " ", texto2018)
  texto2018 <- gsub("[[:punct:]]", " ", texto2018)
  
  return(texto2018)
}

# Itera sobre los archivos PDF y extrae el texto de cada uno
textos_limpios2018 <- lapply(archivos_pdf2018, function(archivo2018) {
  texto2018 <- tolower(pdf_text(archivo2018))
  textos_limpios2018 <- limpiar_texto(texto2018)
  return(textos_limpios2018)
})

# Usa directamente el texto preprocesado
todos_los_textos2018 <- tolower(textos_limpios2018[[1]])

# Elimina stopwords
todos_los_textos2018 <- removeWords(todos_los_textos2018, stopwords("spanish"))
palabras_eliminar2018 <- c("cada","parte", "mismas", "deberán", "conforme","cualquier", "partir", "podrá", "podrán", "través","adjunta","así", "deberá")
todos_los_textos2018 <- gsub(paste(palabras_eliminar2018, collapse = "|"), "", todos_los_textos2018, ignore.case = TRUE) 


corpus2018 <- Corpus(VectorSource(todos_los_textos2018))
dtm2018 <- DocumentTermMatrix(corpus2018)


conteo_total2018 <- colSums(as.matrix(dtm2018))
terminos_comunes2018 <- sort(conteo_total2018, decreasing = TRUE)

# Data Frame para la tabla de frecuencias 
Frecuencia2018 <- data.frame(termino2018 = names(terminos_comunes2018), conteo2018 = terminos_comunes2018)
head(Frecuencia2018, 25)

##                     termino2018 conteo2018
## obras                     obras        230
## programa               programa        191
## cdi                         cdi        122
## recursos               recursos        109
## federales             federales         85
## coordinación       coordinación         82
## infraestructura infraestructura         81
## ejecución             ejecución         81
## entidades             entidades         80
## estatal                 estatal         73
## obra                       obra         72
## acuerdo                 acuerdo         71
## gobierno               gobierno         68
## operación             operación         67
## localidades         localidades         66
## social                   social         65
## caso                       caso         65
## reglas                   reglas         64
## dependencias       dependencias         63
## general                 general         62
## dependencia         dependencia         61
## oficial                 oficial         59
## desarrollo           desarrollo         58
## diario                   diario         57
## ley                         ley         53

wordcloud(words = Frecuencia2018$termino2018,
          freq = Frecuencia2018$conteo2018,
          min.freq = 2,
          max.words = 100, random.order = FALSE, rot.per = 0.2, colors = brewer.pal(10,"Paired"),
          family="Arial")

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in text.default(x1, y1, words[i], cex = size[i], offset = 0, srt =
## rotWord * : font family not found in Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

## Warning in strwidth(words[i], cex = size[i], ...): font family not found in
## Windows font database

# Definir el número de tópicos
num_topics <- 5

# Generar el modelo LDA
lda_model2018 <- LDA(dtm2018, k = num_topics)

# Obtener los términos más importantes de cada tópico
terms2018 <- tidy(lda_model2018, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(5, wt = beta)

# Imprimir los términos más importantes de cada tópico
cat("Términos más importantes de cada tópico:\n")

## Términos más importantes de cada tópico:

print(terms2018)

## # A tibble: 25 × 3
## # Groups:   topic [5]
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     4 desarrollo 0.0169
##  2     2 entidades  0.0170
##  3     2 programa   0.0257
##  4     3 programa   0.0162
##  5     4 programa   0.0242
##  6     5 programa   0.0150
##  7     4 pueblos    0.0112
##  8     1 recursos   0.0114
##  9     3 recursos   0.0114
## 10     5 recursos   0.0196
## # ℹ 15 more rows

# Graficar la distribución de documentos en tópicos
ggplot(terms2018, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y", ncol = 3) +
  labs(title = "Distribución de Términos en Tópicos 2018",
       x = "Término",
       y = "Peso Beta") +
  theme_minimal()

# Extra: Tabla de términos y pesos beta
terms_table2018<- tidy(lda_model2018, matrix = "beta") %>%
  select(term, beta)

# Visualizar la tabla de términos
cat("\nTabla de términos y pesos beta:\n")

## 
## Tabla de términos y pesos beta:

print(terms_table2018)

## # A tibble: 11,120 × 2
##    term          beta
##    <chr>        <dbl>
##  1 acciones 4.04e-  3
##  2 acciones 5.45e-  3
##  3 acciones 5.40e-  3
##  4 acciones 4.36e-  3
##  5 acciones 3.10e-  3
##  6 acción   1.52e-284
##  7 acción   8.97e-280
##  8 acción   5.33e-  4
##  9 acción   4.13e-  4
## 10 acción   2.32e-279
## # ℹ 11,110 more rows

# Cargar bibliotecas
# Instalar y cargar la biblioteca para análisis de emociones
library(text)
library(dplyr)

# Calcular las puntuaciones de emociones con la biblioteca 'text'
puntuaciones_emociones2018 <- get_nrc_sentiment(todos_los_textos2018, language = "spanish")

# Traducción manual de nombres de emociones (puedes ajustar según tus necesidades)
sentimientos_espanol2018 <- c("ira", "anticipacion", "disgusto", "miedo", "alegria", "desconfianza", "positivo", "tristeza", "asombro", "confianza")

# Asignar los nombres traducidos al data frame
colnames(puntuaciones_emociones2018) <- sentimientos_espanol2018

# Crear un data frame con los sentimientos y sus valores
sentimientos_df2018 <- data.frame(sentimiento = names(colSums(puntuaciones_emociones2018)), valor = colSums(puntuaciones_emociones2018))

# Ordenar el data frame por frecuencia de sentimientos
sentimientos_df2018 <- sentimientos_df2018 %>% arrange(desc(valor))

# Gráfico de barras para las puntuaciones de emociones
library(ggplot2)
ggplot(sentimientos_df2018, aes(x = reorder(sentimiento, -valor), y = valor, fill = sentimiento)) +
  geom_bar(stat = "identity") +
  labs(title = "Puntuaciones de emociones 2018", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal() + scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotar etiquetas del eje x

Análisis de sentimientos de documentos publicados por el Instituto Nacional de Pueblos Indígenas

Yamileth Luna y Jesús Cienfuegos

2023-11-14