La minería de texto (TM) es el proceso de extraer información útil, patrones o conocimiento de textos no estructurados.
Consta de 3 etapas:
Obtener datos: El Reconocimiento Óptico de Caracteres (OCR) es una tecnología que permite convertir imágenes de texto en texto editable, también conocido como extracción de texto de imágenes.
Explorar Datos: Representación gráfica o visual de los datos para su interpretación. Los métodos más comunes son el análisis de sentimientos, la nube de palabras y el topic modeling.
Análisis predictivo: Son las técnicas y modelos estadísticos para predecir resultados futuros. Los modelos más usados son Random Forest, redes neuronales y regresiones.
#install.packages("tidyverse") # Data wrangling
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("tesseract") # OCR
library(tesseract)
#install.packages("magick") # PNG
library(magick)
## Warning: package 'magick' was built under R version 4.3.2
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
#install.packages("officer") # Exportar formatos Office (Word)
library(officer)
## Warning: package 'officer' was built under R version 4.3.2
#install.packages("pdftools") # Leer PDFs
library(pdftools)
## Using poppler version 23.04.0
#install.packages("purrr") # Funcion map(): para aplicar una funcion a cada elemento de un vector
library(purrr)
#install.packages("tm") # Text Mining
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
#install.packages("RColorBrewer") # Manejar Colores
library(RColorBrewer)
#install.packages("wordcloud") # Nube de Palabras
library(wordcloud)
#install.packages("topicmodels") # Modelos de Temas: que nos diga de que esta hablando el texto
library(topicmodels)
#install.packages("ggplot2")
library(ggplot2)
# imagen1 <- image_read("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/imagen1.PNG") # leer imagen
# texto1 <- ocr(imagen1) # Convertir imagen a texto
# doc1 <- read_docx() # Crea un documento en blanco
# doc1 <- doc1 %>% body_add_par(texto1, style="Normal") # pegar texto a doc en blanco
#print(doc1, target = "texto1.2.docx") # guardar docs en la computadora, con titulo
# imagen2 <- image_read("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/imagen2.PNG") # leer imagen
# tesseract_download("spa") # Bajar lenguaje de engine de tesseract
#texto2 <- ocr(imagen2, engine = tesseract("spa"))
# doc2 <- read_docx() # Crea un documento en blanco
# doc2 <- doc2 %>% body_add_par(texto2, style="Normal") # pegar texto a doc en blanco
# print(doc2, target = "texto2.2.docx") # guardar docs en la computadora, con titulo
# pdf1 <- pdf_convert("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/pdf1.pdf", dpi = 600) %>% map(ocr)
# convertir pdf en imagen
## Actividad 1. Novela “IT”
# pdf2 <- pdf_convert("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/eso3.pdf", dpi = 600) %>% map(ocr)
# imagen3 <- image_read("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/eso3_1.PNG") # leer imagen
# tesseract_download("spa") # Bajar lenguaje de engine de tesseract
# texto3 <- ocr(imagen3, engine = tesseract("spa"))
# imagen4 <- image_read("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/eso3_2.PNG") # leer imagen
# texto4 <- ocr(imagen4, engine = tesseract("spa"))
# imagen5 <- image_read("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/eso3_3.PNG") # leer imagen
# texto5 <- ocr(imagen5, engine = tesseract("spa")) # Guardar texto
# doc3 <- read_docx() # Crea un documento en blanco
# doc3 <- doc3 %>% body_add_par(texto3, style="Normal") %>% body_add_par(texto4, style="Normal") %>% body_add_par(texto5, style="Normal")# pegar texto a doc en blanco
# print(doc3, target = "eso3.2.docx") # guardar docs en la computadora, con titulo
# Word-frequency analysis of the "I Have a Dream" speech.
# Download the speech text; each line of the file becomes one element.
text <- readLines("http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt")

# Build a corpus with one document per line of the speech.
corpus <- Corpus(VectorSource(text))
# inspect(corpus)

# Normalise the corpus: lowercase, strip punctuation, strip digits and drop
# English stop words. In the recorded run every tm_map() step on this
# SimpleCorpus emitted a "transformation drops documents" warning.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("en"))
# corpus <- tm_map(corpus, removeWords, c("dream","will")) # optionally drop specific words

# Term-document matrix: counts of each term per line of the speech.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)

# Total count of each term across all lines, most frequent first, and the
# same information as a data frame (columns: word, freq).
frecuencia <- rowSums(m) %>% sort(decreasing = TRUE)
frecuencia_df <- data.frame(word = names(frecuencia), freq = frecuencia)

# Bar chart of the ten most frequent terms, ordered by descending count, with
# the count printed above each bar.
frecuencia_df %>%
  head(10) %>%
  ggplot(aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  geom_text(aes(label = freq), vjust = -0.5, size = 3) +
  labs(
    title = "Top 10 Palabras Mas frecuentes",
    subtitle = "Discurso 'I Have A Dream' - MLK",
    x = "Palabra",
    y = "Frecuencia"
  ) +
  ylim(0, 20)
# Word cloud of the speech. The data preparation is the same as for the
# frequency analysis, from importing the text through building frecuencia_df.
# The RNG seed is fixed before drawing (presumably so the layout is
# reproducible between runs).
set.seed(123)
with(
  frecuencia_df,
  wordcloud(
    words = word,
    freq = freq,
    min.freq = 1,
    random.order = FALSE, # place words in decreasing-frequency order
    colors = brewer.pal(8, "Spectral")
  )
)
# Load the Word document that holds the OCR'd novel excerpt and build a corpus
# from its text.
#
# read_docx() returns an rdocx object, not a character vector. Feeding that
# object directly to iconv()/VectorSource() turns its internal fields (temp
# path, settings, style tables, ...) into the corpus "documents" instead of
# the novel's text. docx_summary() returns a data frame describing the
# document's content, with the text of each element in its `text` column.
#
# NOTE: the inspect() output recorded below this chunk predates this fix and
# lists those internal fields; re-knit to refresh it.
textit <- read_docx("/Users/lightedit/Documents/TEC SEMESTRE 6.1/M2/R/text mining/eso3.2.docx")

# Keep only elements that actually carry text.
texto_docx <- docx_summary(textit)[["text"]]
texto_docx <- texto_docx[!is.na(texto_docx) & nzchar(texto_docx)]

# Transliterate to ASCII so accented characters are replaced by plain ones.
texto_sin_acentos <- iconv(texto_docx, to = "ASCII//TRANSLIT")

# One corpus document per text element of the Word file.
corpus2 <- Corpus(VectorSource(texto_sin_acentos))
inspect(corpus2)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 15
##
## [1] /var/folders/yz/c3scz2z12ls8w840h0ldyjf00000gq/T//RtmpqFA4cu/file26152a53ce76
## [2] list(zoom = 1.8, default_tab_stop = 0.491666666666667, hyphenation_zone = 0.295138888888889, decimal_symbol = ",", list_separator = ";", even_and_odd_headers = FALSE, compatibility_mode = "15")
## [3] <environment>
## [4] list(data = character(0))
## [5] list(data = c("dc", "dc", "dc", "cp", "dc", "cp", "cp", "dcterms", "dcterms", "cp", "title", "subject", "creator", "keywords", "description", "lastModifiedBy", "revision", "created", "modified", "category", "", "", "", "", "", "", "", " xsi:type=\\"dcterms:W3CDTF\\"", " xsi:type=\\"dcterms:W3CDTF\\"", "", "", "", "", "", "", "lightedit", "9", "2017-02-28T11:18:00Z", "2024-02-26T16:36:00Z", ""), ns = c(cp = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties", dc = "http://purl.org/dc/elements/1.1/", \ndcmitype = "http://purl.org/dc/dcmitype/", dcterms = "http://purl.org/dc/terms/", xsi = "http://www.w3.org/2001/XMLSchema-instance"))
## [6] <environment>
## [7] <environment>
## [8] list(style_type = c("paragraph", "paragraph", "paragraph", "paragraph", "character", "table", "numbering", "character", "paragraph", "table", "table", "character", "character", "character", "paragraph", "paragraph", "table", "paragraph", "paragraph", "paragraph", "character", "character", "paragraph", "paragraph"), style_id = c("Normal", "Titre1", "Titre2", "Titre3", "Policepardfaut", "TableauNormal", "Aucuneliste", "strong", "centered", "tabletemplate", "Listeclaire-Accent2", "Titre1Car", "Titre2Car", \n"Titre3Car", "ImageCaption", "TableCaption", "Tableauprofessionnel", "TM1", "TM2", "Textedebulles", "TextedebullesCar", "referenceid", "graphictitle", "tabletitle"), style_name = c("Normal", "heading 1", "heading 2", "heading 3", "Default Paragraph Font", "Normal Table", "No List", "strong", "centered", "table_template", "Light List Accent 2", "Titre 1 Car", "Titre 2 Car", "Titre 3 Car", "Image Caption", "Table Caption", "Table Professional", "toc 1", "toc 2", "Balloon Text", "Texte de bulles Car", \n"reference_id", "graphic title", "table title"), base_on = c(NA, "Normal", "Normal", "Normal", NA, NA, NA, "Policepardfaut", "Normal", "TableauNormal", "TableauNormal", "Policepardfaut", "Policepardfaut", "Policepardfaut", "Normal", "ImageCaption", "TableauNormal", "Normal", "Normal", "Normal", "Policepardfaut", "Policepardfaut", "ImageCaption", "TableCaption"), is_custom = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, \nFALSE, TRUE, TRUE, TRUE, TRUE), is_default = c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), align = c(NA, NA, NA, NA, NA, NA, NA, NA, "center", "right", NA, NA, NA, NA, "center", NA, NA, NA, NA, NA, NA, NA, NA, NA), keep_next = c(FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, \nFALSE, FALSE), line_spacing = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), padding.bottom = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "100", "100", NA, NA, NA, NA, NA), padding.top = c(NA, "480", "200", "200", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), padding.left = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "240", NA, NA, NA, NA, NA), padding.right = c(NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), shading.color.par = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.bottom.width = c(NA, 0.5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.bottom.color = c(NA, "auto", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.bottom.style = c(NA, "single", NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.top.width = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.top.color = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.top.style = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.left.width = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.left.color = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.left.style = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.right.width = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), border.right.color = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, \nNA, NA, NA, NA, NA, NA, NA, NA), border.right.style = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), font.size = c(NA, "32", "26", NA, NA, NA, NA, NA, NA, NA, NA, "32", "26", NA, NA, NA, NA, NA, NA, "18", "18", NA, NA, NA), bold = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), italic = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), underlined = c(NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), color = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), font.family = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Lucida Grande", "Lucida Grande", NA, NA, NA), vertical.align = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "superscript", NA, NA), shading.color = c(NA, NA, NA, NA, NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), hansi.family = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Lucida Grande", "Lucida Grande", NA, NA, NA), eastasia.family = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cs.family = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), bold.cs = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, \nNA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), font.size.cs = c(NA, "32", "26", NA, NA, NA, NA, NA, NA, NA, NA, "32", "26", NA, NA, NA, NA, NA, NA, "18", "18", NA, NA, NA), lang.val = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), lang.eastasia = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), lang.bidi = 
c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, \nNA, NA, NA))
## [9] list(nodes_names = c("p", "p", "p"), which = 3)
## [10] list()
## [11] list()
## [12] <environment>
## [13] <environment>
## [14] list(paragraph = "Normal", character = "Default Paragraph Font", table = "Normal Table", numbering = "No List")
## [15] list(page = c(width = 11900, height = 16840), landscape = FALSE, margins = c(top = 1417, bottom = 1417, left = 1417, right = 1417, header = 708, footer = 708))
# Word-frequency analysis of the novel corpus (corpus2).
#
# Clean the corpus: lowercase, strip punctuation, strip digits and drop
# Spanish stop words. In the recorded run every tm_map() step on this
# SimpleCorpus emitted a "transformation drops documents" warning.
corpus2 <- tm_map(corpus2, content_transformer(tolower))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, removeWords, stopwords("spanish"))
# corpus2 <- tm_map(corpus2, removeWords, c("palabra1", "palabra2")) # optionally drop specific words

# Term-document matrix: counts of each term per corpus document.
tdm2 <- TermDocumentMatrix(corpus2)
m2 <- as.matrix(tdm2)

# Total count of each term across the corpus, most frequent first, and the
# same information as a data frame (columns: word, freq).
frecuencia2 <- sort(rowSums(m2), decreasing = TRUE)
frecuencia_df2 <- data.frame(word = names(frecuencia2), freq = frecuencia2)

# Bar chart of the ten most frequent terms with the count above each bar.
# The y axis starts at 0 and its upper limit is taken from the data: the
# previous hard-coded ylim(0, 20) discarded every bar whose count exceeded 20
# (2 of the 10 rows were removed in the recorded run). The 10% expansion at
# the top leaves room for the count labels.
ggplot(head(frecuencia_df2, 10), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = freq), vjust = -0.5, size = 3) +
  labs(
    title = "Top 10 Palabras Mas frecuentes",
    subtitle = "Novela 'IT'",
    x = "Palabra",
    y = "Frecuencia"
  ) +
  scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.1)))
# Word cloud of the novel. The data preparation is the same as for the
# frequency analysis, from importing the text through building
# frecuencia_df2. This chunk previously plotted frecuencia_df (the speech
# data), which just redrew the first cloud.
# The RNG seed is fixed before drawing (presumably so the layout is
# reproducible between runs).
set.seed(123)
wordcloud(
  words = frecuencia_df2$word,
  freq = frecuencia_df2$freq,
  min.freq = 1,
  random.order = FALSE, # place words in decreasing-frequency order
  colors = brewer.pal(8, "Spectral")
)