# extract-all-table-from-PDF.R

# NOTE: the workspace is intentionally not cleared with rm(list = ls());
# doing so would delete the caller's objects whenever this script is sourced.
# Restart the R session instead if a clean environment is needed.

# Input data ----
# install.packages("tabulizer")
# Example PDF taken from the tabulizer GitHub repository (inst/examples).
f2 <- "https://github.com/leeper/tabulizer/raw/master/inst/examples/data.pdf"

# Page 2 contains two tables. The printed result below is a list holding one
# character matrix per table, with the header row as the first matrix row.
tabulizer::extract_tables(f2, pages = 2, method = "decide")

## [[1]]
##      [,1]           [,2]          [,3]           [,4]          [,5]     
## [1,] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
## [2,] "5.1"          "3.5"         "1.4"          "0.2"         "setosa" 
## [3,] "4.9"          "3.0"         "1.4"          "0.2"         "setosa" 
## [4,] "4.7"          "3.2"         "1.3"          "0.2"         "setosa" 
## [5,] "4.6"          "3.1"         "1.5"          "0.2"         "setosa" 
## [6,] "5.0"          "3.6"         "1.4"          "0.2"         "setosa" 
## [7,] "5.4"          "3.9"         "1.7"          "0.4"         "setosa" 
## 
## [[2]]
##      [,1]  [,2]           [,3]          [,4]           [,5]         
## [1,] ""    "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [2,] "145" "6.7"          "3.3"         "5.7"          "2.5"        
## [3,] "146" "6.7"          "3.0"         "5.2"          "2.3"        
## [4,] "147" "6.3"          "2.5"         "5.0"          "1.9"        
## [5,] "148" "6.5"          "3.0"         "5.2"          "2.0"        
## [6,] "149" "6.2"          "3.4"         "5.4"          "2.3"        
## [7,] "150" "5.9"          "3.0"         "5.1"          "1.8"        
##      [,6]       
## [1,] "Species"  
## [2,] "virginica"
## [3,] "virginica"
## [4,] "virginica"
## [5,] "virginica"
## [6,] "virginica"
## [7,] "virginica"

# Pull the text of page 2; the printed result below is a single character
# string with lines separated by "\r\n".
tabulizer::extract_text(file = f2, pages = 2)

## [1] "Sepal.Length Sepal.Width Petal.Length Petal.Width Species\r\n5.1 3.5 1.4 0.2 setosa\r\n4.9 3.0 1.4 0.2 setosa\r\n4.7 3.2 1.3 0.2 setosa\r\n4.6 3.1 1.5 0.2 setosa\r\n5.0 3.6 1.4 0.2 setosa\r\n5.4 3.9 1.7 0.4 setosa\r\nSepal.Length Sepal.Width Petal.Length Petal.Width Species\r\n145 6.7 3.3 5.7 2.5 virginica\r\n146 6.7 3.0 5.2 2.3 virginica\r\n147 6.3 2.5 5.0 1.9 virginica\r\n148 6.5 3.0 5.2 2.0 virginica\r\n149 6.2 3.4 5.4 2.3 virginica\r\n150 5.9 3.0 5.1 1.8 virginica\r\n2\r\n"

# Render page 2 of the PDF to an image file in the current working directory
# at resolution 600; the printed result below is the path of the written file.
tabulizer::make_thumbnails(
  file = f2,
  outdir = getwd(),
  pages = 2,
  resolution = 600
)

## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\file68e42d673c112.png"

# Open the help page for make_thumbnails(). The lookup is qualified with the
# package name because tabulizer is never attached with library() in this
# script, so an unqualified `?make_thumbnails` only resolves if an earlier
# tabulizer:: call has already loaded the namespace.
utils::help("make_thumbnails", package = "tabulizer")

## starting httpd help server ... done

# Other tabulizer functions (descriptions from the vignette referenced below):
# - extract_text() converts the text of an entire file or specified pages into an R character vector.
# - split_pdf() and merge_pdfs() split and merge PDF documents, respectively.
# - extract_metadata() extracts PDF metadata as a list.
# - get_n_pages() determines the number of pages in a document.
# - get_page_dims() determines the width and height of each page in pt (the unit used by the area and columns arguments).
# - make_thumbnails() converts specified pages of a PDF file to image files.
# Reference: https://cran.r-project.org/web/packages/tabulizer/vignettes/tabulizer.html

# extract-all-table-from-PDF.R
#
# liyix
#
# 2020-10-13