Instalar los paquetes necesarios

# Install only the packages that are not available yet. Calling
# install.packages() unconditionally on packages that are already loaded in
# the session is what produced the "Updating loaded packages" error when this
# chunk was run.
paquetes <- c("tm", "SnowballC", "wordcloud", "pdftools")
instalado <- vapply(
  paquetes,
  function(p) nzchar(system.file(package = p)),
  logical(1)
)
if (!all(instalado)) {
  install.packages(paquetes[!instalado])
}
Error in install.packages : Updating loaded packages

Cargar las librerías

library(tm)
Aviso: package ‘tm’ was built under R version 4.4.3
Cargando paquete requerido: NLP
Aviso: package ‘NLP’ was built under R version 4.4.2
library(SnowballC)
library(wordcloud)
Aviso: package ‘wordcloud’ was built under R version 4.4.3
Cargando paquete requerido: RColorBrewer
library(pdftools)
Aviso: package ‘pdftools’ was built under R version 4.4.3
Using poppler version 23.08.0

Leer el texto del archivo en PDF

# Let the user pick the PDF interactively and extract its text with pdftools.
pdf_file <- file.choose()
texto_pdf <- pdf_text(pdf_file)

# Fail early with a clear message when nothing could be extracted (e.g. a
# scanned, image-only PDF); otherwise the later steps would run on empty text
# and fail with a much less obvious error.
if (length(texto_pdf) == 0 || !any(nzchar(trimws(texto_pdf)))) {
  stop(
    "No se pudo extraer texto del PDF seleccionado: ", pdf_file,
    call. = FALSE
  )
}

Crear un objeto Corpus

# Wrap the extracted text in a vector source and build the tm corpus from it.
corpus <- Corpus(VectorSource(x = texto_pdf))

Limpiar el texto

# Normalise the corpus text before counting terms.
# NOTE(review): when this chunk was originally run, every tm_map() call
# emitted the warning "transformation drops documents". Presumably this is
# tm's check on document names for a corpus built from an unnamed character
# vector and is harmless here -- confirm against the installed tm version.

# content_transformer() keeps the result a valid corpus regardless of the
# corpus class, instead of relying on tolower() being applied to raw text.
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lower case
corpus <- tm_map(corpus, removeNumbers) # Remove digits
corpus <- tm_map(corpus, removePunctuation) # Remove ASCII punctuation
# Second pass using Unicode character properties: the default pass is
# documented as ASCII-only, so Spanish marks such as inverted question and
# exclamation marks, guillemets and long dashes may otherwise stay attached
# to words.
corpus <- tm_map(corpus, removePunctuation, ucp = TRUE)
corpus <- tm_map(corpus, removeWords, words = stopwords("spanish")) # Remove stop words
# Collapse whitespace last so the gaps left by the removals above are cleaned
# up as well.
corpus <- tm_map(corpus, stripWhitespace)
Aviso: transformation drops documents

Stemming (opcional)

# Optional step: reduce every word to its Spanish stem. The terms counted
# afterwards are therefore stems, not the full words.
corpus <- tm_map(
  corpus,
  FUN = stemDocument,
  language = "spanish"
)
Aviso: transformation drops documents

Crear una matriz de términos-documento (TDM)

# Build the term-document matrix; wordLengths = c(1, Inf) makes it consider
# words of any length.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(wordLengths = c(1, Inf))
)

# Turn the TDM into an ordinary matrix (terms in rows).
matriz_tdm <- as.matrix(tdm)

# Sum each term's row to get its total count, then order the counts from the
# most to the least frequent term.
frecuencia_palabras <- rowSums(matriz_tdm)
frecuencia_palabras <- sort(frecuencia_palabras, decreasing = TRUE)

Crear la nube de palabras

# Draw the word cloud for the 50 most frequent terms, placing the most
# frequent ones first (random.order = FALSE) and colouring them with the
# 8-colour "Dark2" palette.
# The word positions are chosen with random numbers, so fix the seed to get
# the same figure on every run.
set.seed(1234)
wordcloud(
  words = names(frecuencia_palabras),
  freq = frecuencia_palabras,
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2")
)

LS0tDQp0aXRsZTogIk1lbnNhamUgYSBsYSBOYWNpw7NuIDIwMjQiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpJbnN0YWxhciBsb3MgcGFxdWV0ZXMgbmVjZXNhcmlvcw0KDQpgYGB7cn0NCmluc3RhbGwucGFja2FnZXMoYygidG0iLCAiU25vd2JhbGxDIiwgIndvcmRjbG91ZCIsICJwZGZ0b29scyIpKQ0KYGBgDQoNCkNhcmdhciBsYXMgbGlicmVyaWFzDQoNCmBgYHtyfQ0KbGlicmFyeSh0bSkNCmxpYnJhcnkoU25vd2JhbGxDKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQpsaWJyYXJ5KHBkZnRvb2xzKQ0KYGBgDQoNCg0KTGVlciBlbCB0ZXh0byBkZWwgYXJjaGl2byBlbiBQREYNCmBgYHtyfQ0KcGRmX2ZpbGUgPC0gZmlsZS5jaG9vc2UoKQ0KdGV4dG9fcGRmIDwtIHBkZl90ZXh0KHBkZl9maWxlKQ0KYGBgDQoNCkNyZWFyIHVuIG9iamV0byBDb3JwdXMNCg0KYGBge3J9DQpjb3JwdXMgPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0b19wZGYpKQ0KYGBgDQoNCkxpbXBpYXIgZWwgVGV4dG8NCmBgYHtyfQ0KY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIHRvbG93ZXIpICMgQ29udmVydGlyIGEgbWluw7pzY3VsYXMNCmNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLCByZW1vdmVOdW1iZXJzKSAjIEVsaW1pbmFyIG7Dum1lcm9zDQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlUHVuY3R1YXRpb24pICMgRWxpbWluYXIgcHVudHVhY2nDs24NCmNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLCBzdHJpcFdoaXRlc3BhY2UpICMgRWxpbWluYXIgZXNwYWNpb3MgZW4gYmxhbmNvDQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlV29yZHMsIHdvcmRzID0gc3RvcHdvcmRzKCJzcGFuaXNoIikpICMgRWxpbWluYXIgc3RvcHdvcmRzDQpgYGANCg0KU3RlYW1taW5nIChvcGNpb25hbCkNCg0KYGBge3J9DQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RlbURvY3VtZW50LCBsYW5ndWFnZSA9ICJzcGFuaXNoIikgIyBTdGVtbWluZyBlbiBlc3Bhw7FvbA0KYGBgDQoNCkNyZWFyIHVuYSBtYXRyaXogZGUgdMOpcm1pbm9zLWRvY3VtZW50byAoVERNKQ0KYGBge3J9DQp0ZG0gPC0gVGVybURvY3VtZW50TWF0cml4KGNvcnB1cywgY29udHJvbCA9IGxpc3QoDQogIHdvcmRMZW5ndGhzID0gYygxLCBJbmYpICMgQ29uc2lkZXJhIHBhbGFicmFzIGRlIGN1YWxxdWllciBsb25naXR1ZA0KKSkNCg0KIyBDb252ZXJ0aXIgbGEgVERNIGEgdW5hIG1hdHJpeg0KbWF0cml6X3RkbSA8LSBhcy5tYXRyaXgodGRtKQ0KDQojIE9idGVuZXIgbGEgZnJlY3VlbmNpYSBkZSBwYWxhYnJhcw0KZnJlY3VlbmNpYV9wYWxhYnJhcyA8LSBzb3J0KHJvd1N1bXMobWF0cml6X3RkbSksIGRlY3JlYXNpbmcgPSBUUlVFKQ0KDQpgYGANCg0KQ3JlYXIgbGEgbnViZSBkZSBwYWxhYnJhcw0KDQpgYGB7cn0NCndvcmRjbG91ZChuYW1lcyhmcmVjdWVuY2lhX3BhbGFicmFzKSwgZnJlY3VlbmNpYV9wYWxhYnJhcywNCiAgbWF4LndvcmRzID0gNTAsIHJhbmRvbS5vcmRlciA9IEZBTFNF
LCBjb2xvcnMgPSBicmV3ZXIucGFsKG5hbWUgPSAiRGFyazIiLCBuID0gOCkpDQpgYGANCg0K