
Llamar librerías
library(tesseract)
library(magick)
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
library(officer)
library(pdftools)
## Using poppler version 23.04.0
library(purrr)
Obtener texto de un PDF
# file.choose()  # uncomment to locate the PDF interactively

# Absolute location of the scanned PDF; file.path() joins the pieces with "/".
pdf_path <- file.path(
  "/Users/luisangel/Library/CloudStorage",
  "OneDrive-InstitutoTecnologicoydeEstudiosSuperioresdeMonterrey",
  "7th Season/M2/eso3.pdf"
)

# Build the Spanish OCR engine once and reuse it for every page. The
# original piped the pages into ocr() with its default engine, whereas the
# rest of the script reads the same pages with tesseract("spa").
spa_engine <- tesseract("spa")

# pdf_convert() renders each page to a PNG (eso3_1.png, eso3_2.png, ...) in
# the current working directory and returns the file names; each PNG is then
# OCR'd, so pdf1 is a list with one text element per page.
# NOTE(review): pdf1 is not referenced again in the visible part of the
# script; the conversion is still needed because it creates the PNG files.
pdf1 <- pdf_path %>%
  pdf_convert(dpi = 600) %>%
  map(function(png) ocr(png, engine = spa_engine))
## Converting page 1 to eso3_1.png... done!
## Converting page 2 to eso3_2.png... done!
## Converting page 3 to eso3_3.png... done!
# Folder that holds the page images produced from eso3.pdf. Stated once
# instead of being repeated inside every image_read() call.
# NOTE(review): pdf_convert() writes its PNGs into the working directory, so
# these reads only find fresh files when the working directory is this
# folder -- confirm how the script is run.
img_dir <- file.path(
  "/Users/luisangel/Library/CloudStorage",
  "OneDrive-InstitutoTecnologicoydeEstudiosSuperioresdeMonterrey",
  "7th Season/M2"
)

# Load each rendered page as a magick image for OCR.
imagen1 <- image_read(file.path(img_dir, "eso3_1.png"))
imagen2 <- image_read(file.path(img_dir, "eso3_2.png"))
imagen3 <- image_read(file.path(img_dir, "eso3_3.png"))
#PDF
# Create the Spanish tesseract engine a single time and share it across the
# three pages instead of constructing a new engine for every call.
spa_engine <- tesseract("spa")

# Extract the text of each page image.
texto1 <- ocr(imagen1, engine = spa_engine)
texto2 <- ocr(imagen2, engine = spa_engine)
texto3 <- ocr(imagen3, engine = spa_engine)
Guardar texto en WORD
# Join the OCR output of the three pages into one string.
texto_completo <- paste(texto1, texto2, texto3, sep = " ")

# body_add_par() writes its value as ONE paragraph, and line breaks embedded
# in the string are not turned into paragraph breaks. Passing the whole OCR
# text in a single call therefore collapses the document into one block.
# Split on the line breaks and add one paragraph per line instead; trimws()
# drops the stray space that the paste() separator leaves at page joins.
lineas <- trimws(unlist(strsplit(texto_completo, "\n", fixed = TRUE)))

# Start from officer's default empty template and append the lines in order.
doc1 <- read_docx()
for (linea in lineas) {
  doc1 <- body_add_par(doc1, linea, style = "Normal")
}

# Write the Word file into the current working directory.
print(doc1, target = "ESO.docx")
CONCLUSIONES
Considero que es una función bastante útil para guardar texto y sería
muy útil poder complementarla con otras funciones que agilicen el
proceso.
LS0tCnRpdGxlOiAiRVNPIgphdXRob3I6ICJMdWlzIEFuZ2VsIERpYXoiCmRhdGU6ICIyMDI0LTA4LTE0IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IGRhcmsKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQpgYGAKIVtdKC9Vc2Vycy9sdWlzYW5nZWwvTGlicmFyeS9DbG91ZFN0b3JhZ2UvT25lRHJpdmUtSW5zdGl0dXRvVGVjbm9sb2dpY295ZGVFc3R1ZGlvc1N1cGVyaW9yZXNkZU1vbnRlcnJleS83dGggU2Vhc29uL00yL0VTTy5naWYpCgojIExsYW1hciBsaWJyZXJpYXMKYGBge3J9CmxpYnJhcnkodGVzc2VyYWN0KQpsaWJyYXJ5KG1hZ2ljaykKbGlicmFyeShvZmZpY2VyKQpsaWJyYXJ5KHBkZnRvb2xzKQpsaWJyYXJ5KHB1cnJyKQpgYGAKCiMgT2J0ZW5lciB0ZXh0byBkZSB1biBQREYKYGBge3J9CiNmaWxlLmNob29zZSgpCnBkZjEgPC0gcGRmX2NvbnZlcnQoIi9Vc2Vycy9sdWlzYW5nZWwvTGlicmFyeS9DbG91ZFN0b3JhZ2UvT25lRHJpdmUtSW5zdGl0dXRvVGVjbm9sb2dpY295ZGVFc3R1ZGlvc1N1cGVyaW9yZXNkZU1vbnRlcnJleS83dGggU2Vhc29uL00yL2VzbzMucGRmIixkcGk9NjAwKSAlPiUgbWFwKG9jcikKYGBgCmBgYHtyfQppbWFnZW4xIDwtIGltYWdlX3JlYWQoIi9Vc2Vycy9sdWlzYW5nZWwvTGlicmFyeS9DbG91ZFN0b3JhZ2UvT25lRHJpdmUtSW5zdGl0dXRvVGVjbm9sb2dpY295ZGVFc3R1ZGlvc1N1cGVyaW9yZXNkZU1vbnRlcnJleS83dGggU2Vhc29uL00yL2VzbzNfMS5wbmciKQoKaW1hZ2VuMiA8LSBpbWFnZV9yZWFkKCIvVXNlcnMvbHVpc2FuZ2VsL0xpYnJhcnkvQ2xvdWRTdG9yYWdlL09uZURyaXZlLUluc3RpdHV0b1RlY25vbG9naWNveWRlRXN0dWRpb3NTdXBlcmlvcmVzZGVNb250ZXJyZXkvN3RoIFNlYXNvbi9NMi9lc28zXzIucG5nIikKCmltYWdlbjMgPC0gaW1hZ2VfcmVhZCgiL1VzZXJzL2x1aXNhbmdlbC9MaWJyYXJ5L0Nsb3VkU3RvcmFnZS9PbmVEcml2ZS1JbnN0aXR1dG9UZWNub2xvZ2ljb3lkZUVzdHVkaW9zU3VwZXJpb3Jlc2RlTW9udGVycmV5Lzd0aCBTZWFzb24vTTIvZXNvM18zLnBuZyIpCmBgYAoKI1BERgpgYGB7cn0KdGV4dG8xIDwtIG9jcihpbWFnZW4xLCBlbmdpbmUgPSB0ZXNzZXJhY3QoInNwYSIpKQp0ZXh0bzIgPC0gb2NyKGltYWdlbjIsIGVuZ2luZSA9IHRlc3NlcmFjdCgic3BhIikpCnRleHRvMyA8LSBvY3IoaW1hZ2VuMywgZW5naW5lID0gdGVzc2VyYWN0KCJzcGEiKSkKYGBgCgojIEd1YXJkYXIgdGV4dG8gZW4gV09SRApgYGB7cn0KdGV4dG9fY29tcGxldG8gPC0gcGFzdGUodGV4dG8xLCB0ZXh0bzIsIHRleHRvMywgc2VwID0gIiAiKQoKZG9jMSA8LSByZWFkX2RvY3goKQpkb2MxIDwtIGRvYzEgJT4lIGJvZHlfYWRkX3Bhcih0ZXh0b19jb21wbGV0bywgc3R5bGUgPSAiTm9ybWFs
IikKcHJpbnQoZG9jMSwgdGFyZ2V0ID0iRVNPLmRvY3giKQpgYGAKCiMgQ09OQ0xVU0lPTkVTCkNvbnNpZGVybyBxdWUgZXMgdW5hIGZ1bmNpb24gYmFzdGFudGUgdXRpbCBwYXJhIGd1YXJkYXIgdGV4dG8geSBzZXJpYSBtdXkgdXRpbCBwb2RlciBjb21wbGVtZW50YXIgY29uIG90cmFzIGZ1bmNpb25lcyBxdWUgYWdpbGljZW4gZWwgcHJvY2Vzby4=