La minería de texto (TM) es el proceso de extraer información útil, patrones o conocimiento de textos no estructurados.
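Consta de tres etapas, que este documento sigue en orden: obtención del texto (OCR de imágenes y de PDF), tokenización del texto y análisis de emociones y sentimientos.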
library(tidyverse) # Manipulación de datos
library(tesseract) # OCR
library(magick) # PNG
library(officer) # Office (word)
library(pdftools) # PDF
library(purrr)
# OCR the first image with ocr()'s default engine and stage the recognised
# text as a paragraph in a document freshly created by read_docx().
image1 <- image_read('C:\\Users\\ACER\\Downloads\\imagen1.PNG')
text1 <- ocr(image1)
doc1 <- body_add_par(read_docx(), text1)
# Writing the document to disk is left disabled:
# print(doc1, target = 'text1.docx')
# OCR the second image with the Spanish language model and stage the
# recognised text as a paragraph in a document freshly created by read_docx().
image2 <- image_read('C:\\Users\\ACER\\Downloads\\imagen2.PNG')
# Fetch the Spanish training data only when it is not installed yet. An
# unconditional tesseract_download('spa') hits the network on every run and
# overwrites the existing spa.traineddata (a recorded run reported
# "Training data already exists. Overwriting ...").
if (!'spa' %in% tesseract_info()$available) {
  tesseract_download('spa')
}
text2 <- ocr(image2, engine = tesseract('spa'))
doc2 <- body_add_par(read_docx(), text2)
# Writing the document to disk is left disabled:
# print(doc2, target = 'text2.docx')
# From pdf to text
# Render each PDF page to a PNG at 600 dpi and OCR every page image.
# pdf_it ends up as a list with one element of recognised text per page.
# The Spanish engine is used so this text matches the per-page OCR done with
# tesseract('spa') further down and the language = 'spanish' sentiment
# lookup that consumes pdf_it; the default engine would mis-read
# accented characters.
pdf_it <- pdf_convert('C:\\Users\\ACER\\Downloads\\eso.pdf', dpi = 600) %>%
  map(function(page_png) ocr(page_png, engine = tesseract('spa')))
## Converting page 1 to eso_1.png... done!
## Converting page 2 to eso_2.png... done!
# From image to word
# Read the two page images, OCR each with the Spanish engine, and write both
# texts as consecutive paragraphs of one Word file.
# NOTE(review): these absolute paths presumably point at the PNGs written by
# pdf_convert() into the working directory - confirm before running elsewhere.
image_it_1 <- image_read('C:\\Users\\ACER\\Documents\\R\\eso_1.png')
image_it_2 <- image_read('C:\\Users\\ACER\\Documents\\R\\eso_2.png')
text_it_1 <- ocr(image_it_1, engine = tesseract('spa'))
text_it_2 <- ocr(image_it_2, engine = tesseract('spa'))
# Append the page texts in order, starting from a document created by
# read_docx().
doc <- Reduce(body_add_par, list(text_it_1, text_it_2), read_docx())
print(doc, target = 'text_it.docx')
library(syuzhet)
library(tm)
## Cargando paquete requerido: NLP
##
## Adjuntando el paquete: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Sentiment analysis of the OCR'd PDF text with the Spanish NRC lexicon.
text_it <- pdf_it
text_words <- get_tokens(text_it)
emotions <- get_nrc_sentiment(text_words, language = 'spanish')
# Joy, Sadness, Anger, Fear, Surprise, Disgust, Anticipation and Trust
# Bar chart of columns 1-8: the values are divided by their grand total and
# then summed per column, giving each column's share of the whole.
emotions[, 1:8] %>%
  prop.table() %>%
  colSums() %>%
  barplot()
# Net polarity for each row of `emotions`: positive score minus negative score.
sentiments <- emotions$positive - emotions$negative
simple_plot(sentiments)