Text mining (TM) is the process of extracting useful information, patterns, or knowledge from unstructured text.
It consists of three stages:
1. Obtaining data: Optical character recognition (OCR) is a technology that converts images of text into editable text; it is also known as extracting text from images.
2. Exploring data: Graphical or visual representation of the data for interpretation. The most common methods are Sentiment Analysis, the Word Cloud, and Topic Modeling.
3. Predictive analysis: Statistical techniques and models used to predict future outcomes. The most widely used models are Random Forest, neural networks, and regressions.
#install.packages("tidyverse") #Data Wrangling
library(tidyverse)
#install.packages("tesseract") #OCR
library(tesseract)
#install.packages("magick") #PNG
library (magick)
#install.packages("officer") #Office(Word)
library(officer)
#install.packages("pdftools") #PDF
library(pdftools)
#install.packages(purrr) #Para la función map
library(purrr)
#install.packages("tm") #Text Mining
library(tm)
#install.packages("RColorBrewer") #Colores
library(RColorBrewer)
#install.packages("wordcloud") # Nube de Palabras
library(wordcloud)
#install.packages("topicmodels") #Modelos de Temas
library(topicmodels)
library(ggplot2)
imagen1 <- image_read("C:\\Users\\Luis Rodriguez\\Downloads\\imagen1.PNG")
texto1 <- ocr(imagen1)
texto1
## [1] "Linear regression with one variable x is also known as univariate linear regression\nor simple linear regression. Simple linear regression is used to predict a single\noutput from a single input. This is an example of supervised learning, which means\nthat the data is labeled, i.e., the output values are known in the training data. Let us\nfit a line through the data using simple linear regression as shown in Fig. 4.1.\n"
doc1 <- read_docx() # Create a blank Word document
doc1 <- doc1 %>% body_add_par(texto1, style = "Normal") # Add the extracted text as a paragraph
print(doc1, target = "texto.docx") # Save the Word file to disk
imagen2 <- image_read("C:\\Users\\Luis Rodriguez\\Downloads\\imagen2.PNG")
tesseract_download("spa")
## [1] "C:\\Users\\Luis Rodriguez\\AppData\\Local\\tesseract5\\tesseract5\\tessdata/spa.traineddata"
texto2 <- ocr(imagen2, engine = tesseract("spa"))
texto2
## [1] "Un importante, y quizá controversial, asunto polÃtico es el que se refiere al efecto del salario mÃnimo sobre\nlas tasas de desempleo en diversos grupos de trabajadores. Aunque este problema puede ser estudiado con\ndiversos tipos de datos (corte transversal, series de tiempo o datos de panel), suelen usarse las series de\ntiempo para observar los efectos agregados. En la tabla 1.3 se presenta un ejemplo de una base de datos\nde series de tiempo sobre tasas de desempleo y salarios mÃnimos.\n"
doc2 <- read_docx() # Create a blank Word document
doc2 <- doc2 %>% body_add_par(texto2, style = "Normal") # Add the extracted text as a paragraph
print(doc2, target = "texto2.docx") # Save the Word file to disk
pdf1 <- pdf_convert("C:\\Users\\Luis Rodriguez\\Downloads\\pdf1.pdf", dpi = 600) %>% map(ocr)
## Converting page 1 to pdf1_1.png... done!
## Converting page 2 to pdf1_2.png... done!
## Converting page 3 to pdf1_3.png... done!
## Converting page 4 to pdf1_4.png... done!
## Converting page 5 to pdf1_5.png... done!
## Converting page 6 to pdf1_6.png... done!
## Converting page 7 to pdf1_7.png... done!
## Converting page 8 to pdf1_8.png... done!
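Note that the pdf_convert() plus ocr() route is only needed for scanned PDFs. If a PDF already has an embedded text layer, pdftools can extract it directly, returning one character string per page with no OCR step; shown here on the same file purely as an illustration:
texto_pdf <- pdf_text("C:\\Users\\Luis Rodriguez\\Downloads\\pdf1.pdf") # direct extraction, no OCR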
pdf2 <- pdf_convert("C:\\Users\\Luis Rodriguez\\Downloads\\eso3.pdf", dpi = 600) %>% map(ocr)
## Converting page 1 to eso3_1.png... done!
## Converting page 2 to eso3_2.png... done!
## Converting page 3 to eso3_3.png... done!
imagen3 <- ("C:\\Users\\Luis Rodriguez\\Documents\\eso3_1.png")
tesseract_download("spa")
## [1] "C:\\Users\\Luis Rodriguez\\AppData\\Local\\tesseract5\\tesseract5\\tessdata/spa.traineddata"
texto3 <- ocr(imagen3, engine = tesseract("spa"))
imagen4 <- ("C:\\Users\\Luis Rodriguez\\Documents\\eso3_2.png")
tesseract_download("spa")
## [1] "C:\\Users\\Luis Rodriguez\\AppData\\Local\\tesseract5\\tesseract5\\tessdata/spa.traineddata"
texto4 <- ocr(imagen4, engine = tesseract("spa"))
imagen5 <- ("C:\\Users\\Luis Rodriguez\\Documents\\eso3_3.png")
tesseract_download("spa")
## [1] "C:\\Users\\Luis Rodriguez\\AppData\\Local\\tesseract5\\tesseract5\\tessdata/spa.traineddata"
texto5 <- ocr(imagen5, engine = tesseract("spa"))
doc3 <- read_docx() # Create a blank Word document
doc3 <- doc3 %>%
body_add_par(texto3, style = "Normal") %>%
body_add_par(texto4, style = "Normal") %>%
body_add_par(texto5, style = "Normal")
print(doc3, target = "eso.docx") # Save the Word file to disk
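A more compact way to build the same document is to fold the page texts into a blank Word file with purrr::reduce. This is an equivalent sketch; the object doc3_alt and the file name eso_alt.docx are hypothetical:
doc3_alt <- reduce(list(texto3, texto4, texto5), body_add_par,
                   style = "Normal", .init = read_docx()) # add each page as a paragraph
print(doc3_alt, target = "eso_alt.docx")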
text <- readLines("http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt")
corpus <- Corpus(VectorSource(text)) # one document per line of the speech
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lowercase
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("en")) # remove English stopwords
#corpus <- tm_map(corpus, removeWords, c("dream", "will")) # optionally drop extra words
tdm <- TermDocumentMatrix(corpus) # terms as rows, documents as columns
m <- as.matrix(tdm)
frecuencia <- sort(rowSums(m), decreasing = TRUE) # total frequency of each term
frecuencia_df <- data.frame(word = names(frecuencia), freq = frecuencia)
ggplot(head(frecuencia_df, 10), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = freq), vjust = -0.5) +
  labs(title = "Top 10 most frequent words",
       subtitle = "M.L. King's 'I Have a Dream' speech", x = "Word", y = "Frequency") +
  ylim(0, 20)
inspect(corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 46
##
## [1]
## [2] even though face difficulties today tomorrow still dream dream deeply rooted american dream
## [3]
## [4] dream one day nation will rise live true meaning creed
## [5]
## [6] hold truths selfevident men created equal
## [7]
## [8] dream one day red hills georgia sons former slaves sons former slave owners will able sit together table brotherhood
## [9]
## [10] dream one day even state mississippi state sweltering heat injustice sweltering heat oppression will transformed oasis freedom justice
## [11]
## [12] dream four little children will one day live nation will judged color skin content character
## [13]
## [14] dream today
## [15]
## [16] dream one day alabama vicious racists governor lips dripping words interposition nullification one day right alabama little black boys black girls will able join hands little white boys white girls sisters brothers
## [17]
## [18] dream today
## [19]
## [20] dream one day every valley shall exalted every hill mountain shall made low rough places will made plain crooked places will made straight glory lord shall revealed flesh shall see together
## [21]
## [22] hope faith go back south
## [23]
## [24] faith will able hew mountain despair stone hope faith will able transform jangling discords nation beautiful symphony brotherhood faith will able work together pray together struggle together go jail together stand freedom together knowing will free one day
## [25]
## [26] will day will day god s children will able sing new meaning
## [27]
## [28] country tis thee sweet land liberty thee sing
## [29] land fathers died land pilgrim s pride
## [30] every mountainside let freedom ring
## [31] america great nation must become true
## [32] let freedom ring prodigious hilltops new hampshire
## [33] let freedom ring mighty mountains new york
## [34] let freedom ring heightening alleghenies pennsylvania
## [35] let freedom ring snowcapped rockies colorado
## [36] let freedom ring curvaceous slopes california
## [37]
## [38]
## [39] let freedom ring stone mountain georgia
## [40] let freedom ring lookout mountain tennessee
## [41] let freedom ring every hill molehill mississippi
## [42] every mountainside let freedom ring
## [43] happens allow freedom ring let ring every village every hamlet every state every city will able speed day god s children black men white men jews gentiles protestants catholics will able join hands sing words old negro spiritual
## [44] free last free last
## [45]
## [46] thank god almighty free last
# The data preparation for the word cloud is the same as in the frequency analysis above, from importing the text through building frecuencia_df
set.seed(123)
wordcloud(words = frecuencia_df$word, freq = frecuencia_df$freq, min.freq = 1, random.order = FALSE, colors = brewer.pal(8, "RdPu"))
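Topic Modeling, the third exploration method mentioned at the start, is not demonstrated above even though topicmodels is loaded. A minimal sketch on the same corpus, assuming empty lines of the speech are dropped first (LDA cannot handle documents with zero terms):
dtm <- DocumentTermMatrix(corpus)                  # documents as rows, terms as columns
dtm <- dtm[rowSums(as.matrix(dtm)) > 0, ]          # drop empty documents (blank lines)
lda <- LDA(dtm, k = 2, control = list(seed = 123)) # fit a two-topic LDA model
terms(lda, 5)                                      # top 5 terms per topic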
#texto_sin_acentos <- iconv(text1, to = "ASCII//TRANSLIT") # alternative: transliterate accented characters to plain ASCII
texto_it <- readLines("C:\\Users\\Luis Rodriguez\\Downloads\\eso3-_1_.txt")
texto_it2 <- iconv(texto_it, to = "UTF-8", sub = "byte")
corpus1 <- Corpus(VectorSource(texto_it2))
corpus1 <- tm_map(corpus1, content_transformer(tolower)) # convert to lowercase
corpus1 <- tm_map(corpus1, removePunctuation) # remove punctuation
corpus1 <- tm_map(corpus1, removeNumbers) # remove numbers
corpus1 <- tm_map(corpus1, removeWords, stopwords("spanish")) # remove Spanish stopwords
tdm1 <- TermDocumentMatrix(corpus1)
m1 <- as.matrix(tdm1)
frecuencia1 <- sort(rowSums(m1), decreasing = TRUE)
frecuencia_df1 <- data.frame(word = names(frecuencia1), freq = frecuencia1)
ggplot(head(frecuencia_df1, 10), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = freq), vjust = -0.5) +
  labs(title = "Top 10 most frequent words",
       subtitle = "It", x = "Word", y = "Frequency") +
  ylim(0, 35)
#inspect(corpus1)
set.seed(123)
wordcloud(words = frecuencia_df1$word, freq = frecuencia_df1$freq, min.freq = 2,
          random.order = FALSE, colors = brewer.pal(8, "Blues"))
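Sentiment Analysis, the remaining exploration method listed at the start, is not covered by this script. A minimal sketch, assuming the syuzhet package (not loaded above) is installed, applied to the 'I Have a Dream' text read earlier:
#install.packages("syuzhet")
library(syuzhet)
sentiment_scores <- get_sentiment(text, method = "syuzhet") # one sentiment score per line
summary(sentiment_scores)                                   # overall distribution of the scores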