
library(tesseract)
library(magick)
## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(officer)
library(pdftools)
## Using poppler version 23.08.0
library(purrr)
# Extract the text of every page of a PDF with OCR (Spanish language model)
# and save each page's text to its own Word document.
ruta_pdf <- "C:\\Users\\lcbor\\Downloads\\eso3.pdf"

# Render each page to a PNG. pdf_convert() returns the names of the files it
# wrote, so those names are reused below instead of hard-coding the folder
# where the PNGs are expected to be. This also removes the fixed assumption
# of exactly three pages.
archivos_png <- pdf_convert(ruta_pdf, dpi = 600)

# Fetch the Spanish training data only when it is not installed yet, and
# build the OCR engine once rather than once per page.
if (!"spa" %in% tesseract_info()$available) {
  tesseract_download("spa")
}
motor_spa <- tesseract("spa")

# OCR every rendered page; element i holds the recognised text of page i.
textos <- map_chr(
  archivos_png,
  function(png) ocr(image_read(png), engine = motor_spa)
)

# Save the text to Word: one document per page, named pdfIT_<page>.docx.
for (i in seq_along(textos)) {
  doc <- body_add_par(read_docx(), textos[[i]], style = "Normal")
  print(doc, target = paste0("pdfIT_", i, ".docx"))
}

# This script quickly converts the IT document from page images to text.
LS0tDQp0aXRsZTogIk9DUl9JVCINCmF1dGhvcjogIkx1aXMgQ2FybG9zIEJvcmJvbiBNYXJ0aW5leiINCmRhdGU6ICIyMDI0LTA4LTE0Ig0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogdHJ1ZQ0KICAgIHRvY19mb2F0OiB0cnVlDQogICAgY29kZV9kb3dubG9hZDogdHJ1ZQ0KICAgIHRoZW1lOiBkYXJrDQotLS0NCiFbXShDOlxcVXNlcnNcXGxjYm9yXFxEb2N1bWVudHNcXE1WNUJaRFZrWm1JMFl6QXROemRqWWkwMFpqaGhMV0UxT0RFdE1XTXpNV016TkRBME5tUTRYa0V5WGtGcWNHZGVRWFZ5TnpZek9ETTNNemdALl9WMV9RTDc1X1VYMTkwX0NSMCwyLDE5MCwyODFfLmpwZykNCg0KDQpgYGB7cn0NCmxpYnJhcnkodGVzc2VyYWN0KQ0KbGlicmFyeShtYWdpY2spDQpsaWJyYXJ5KG9mZmljZXIpDQpsaWJyYXJ5KHBkZnRvb2xzKQ0KbGlicmFyeShwdXJycikNCg0KIyBvYnRlbmVyIHRleHRvIGRlIHVuYSBpbWFnZW4gZW4gUE5HDQpwZGYxIDwtIHBkZl9jb252ZXJ0KCJDOlxcVXNlcnNcXGxjYm9yXFxEb3dubG9hZHNcXGVzbzMucGRmIixkcGkgPSA2MDApICU+JSBtYXAob2NyKQ0KDQppbWFnZW4xIDwtIGltYWdlX3JlYWQoIkM6XFxVc2Vyc1xcbGNib3JcXERvY3VtZW50c1xcZXNvM18xLnBuZyIpDQppbWFnZW4yIDwtIGltYWdlX3JlYWQoIkM6XFxVc2Vyc1xcbGNib3JcXERvY3VtZW50c1xcZXNvM18yLnBuZyIpDQppbWFnZW4zIDwtIGltYWdlX3JlYWQoIkM6XFxVc2Vyc1xcbGNib3JcXERvY3VtZW50c1xcZXNvM18zLnBuZyIpDQoNCnRlc3NlcmFjdF9kb3dubG9hZCgic3BhIikNCnRleHRvMSA8LSBvY3IoaW1hZ2VuMSwgZW5naW5lID0gdGVzc2VyYWN0KCJzcGEiKSkNCnRleHRvMiA8LSBvY3IoaW1hZ2VuMiwgZW5naW5lID0gdGVzc2VyYWN0KCJzcGEiKSkNCnRleHRvMyA8LSBvY3IoaW1hZ2VuMywgZW5naW5lID0gdGVzc2VyYWN0KCJzcGEiKSkNCg0KIyBHdWFyZGFyIHRleHRvIGVuIHdvcmQNCmRvYzEgPC0gcmVhZF9kb2N4KCkNCmRvYzEgPC0gZG9jMSAlPiUgYm9keV9hZGRfcGFyKHRleHRvMSwgc3R5bGUgPSAiTm9ybWFsIikNCnByaW50KGRvYzEsdGFyZ2V0ID0gInBkZklUXzEuZG9jeCIpDQoNCmRvYzIgPC0gcmVhZF9kb2N4KCkNCmRvYzIgPC0gZG9jMiAlPiUgYm9keV9hZGRfcGFyKHRleHRvMiwgc3R5bGUgPSAiTm9ybWFsIikNCnByaW50KGRvYzIsdGFyZ2V0ID0gInBkZklUXzIuZG9jeCIpDQoNCmRvYzMgPC0gcmVhZF9kb2N4KCkNCmRvYzMgPC0gZG9jMyAlPiUgYm9keV9hZGRfcGFyKHRleHRvMywgc3R5bGUgPSAiTm9ybWFsIikNCnByaW50KGRvYzMsdGFyZ2V0ID0gInBkZklUXzMuZG9jeCIpDQoNCiMgRXN0ZSBjb2RpZ28gZXMgY2FwYXogZGUgdHJhZHVjaXIgZWwgdGV4dG8gZGUgSVQgZGUgaW1hZ2VuIGEgdGV4dG8gZGUgZm9ybWEgcmFwaWRhDQpgYGA=