# One-time setup. tabulizer is built on top of rJava and is installed from
# GitHub, so rJava and devtools have to be available first.
# NOTE: the original line used typographic quotes and an en dash and had all
# statements on a single line, which R cannot parse; they are split out and
# written with plain ASCII quotes and "--" here.
install.packages("rJava")
library(rJava) # load and attach 'rJava' now
install.packages("devtools")
devtools::install_github("ropensci/tabulizer", args = "--no-multiarch")
library(tabulizer)

# Single report card used to try the extraction out on one file.
PATH <- 'C:/Users/Himanshu Poddar/Desktop/datathon/Himachal/bilaspur (h.p.) Class - 3 (Mathematics) Report Card.pdf'

# Variant 1: let tabulizer auto-detect every table in the whole document.
lst <- extract_tables(PATH, encoding = "UTF-8")

# Variant 2: restrict extraction to one rectangle on page 1.
# `area` is a list with one numeric vector of comma-separated co-ordinates per
# page (top, left, bottom, right per the tabulizer documentation).
# This call overwrites the `lst` produced by variant 1.
lst <- extract_tables(
  PATH,
  encoding = "UTF-8",
  pages = 1,
  area = list(c(234.019, 38.991, 313.638, 555.396))
)
# Load library
library(tabulizer)

# Directory path (kept with its trailing slash so it can be pasted directly
# in front of a file name)
D_path <- "C:/Users/Himanshu Poddar/Desktop/datathon/Himachal/"

# Only pick up PDF files: anything else in the directory would be handed to
# extract_tables() and make it fail.
files <- list.files(D_path, pattern = "\\.pdf$", ignore.case = TRUE)

# Preallocated, name-indexed container so that the tables of every file are
# kept instead of being overwritten on each iteration.
all_tables <- vector("list", length(files))
names(all_tables) <- files

# Looping through all the pdf files
for (file in files) {
  path <- paste0(D_path, file)
  # Auto-detect and extract every table in the document.
  df <- extract_tables(path, encoding = "UTF-8")
  # `[<-` with list() stores the result even when no table was found.
  all_tables[file] <- list(df)
  # print(df)
}
# Load library
library(tabulizer)

# Directory path (kept with its trailing slash so it can be pasted directly
# in front of a file name)
D_path <- "C:/Users/Himanshu Poddar/Desktop/datathon/Himachal/"

# Only pick up PDF files: anything else in the directory would be handed to
# extract_tables() and make it fail.
files <- list.files(D_path, pattern = "\\.pdf$", ignore.case = TRUE)

# Preallocated, name-indexed container so that the table of every file is
# kept instead of being overwritten on each iteration.
area_tables <- vector("list", length(files))
names(area_tables) <- files

# Looping through all the pdf files
for (file in files) {
  path <- paste0(D_path, file)
  # Extract only the given rectangle on page 1, where area is the coordinate
  # of the table; the same rectangle is applied to every file.
  df <- extract_tables(
    path,
    pages = 1,
    area = list(c(234.019, 38.991, 313.638, 555.396))
  )
  # `[<-` with list() stores the result even when no table was found.
  area_tables[file] <- list(df)
  print(df)
}
# Now that all the tables have been returned in `df`, we can use the appropriate R functions to extract the main content out of them.