# Working with PDF files: extract pages, combine files, and search text.
# NOTE: the workspace is deliberately not cleared here. rm(list = ls()) would
# delete the caller's objects whenever this script is source()d, and it does
# not reset attached packages or options anyway. Run the script in a fresh R
# session when a clean state is needed.
#install.packages("pdftools")
# Load pdftools
library(pdftools)
## Using poppler version 0.73.0
# 1.1 Extract a range of pages ----
# Copy pages 68 through 81 of v7.pdf into a new file, subset.pdf.
pdftools::pdf_subset(
  input = "v7.pdf",
  pages = seq.int(68L, 81L),
  output = "subset.pdf"
)
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\subset.pdf"
# Pages 68 to 81 inclusive are 14 pages, so this should say 14.
pdftools::pdf_length(input = "subset.pdf")
## [1] 14
# 1.2 Combine PDFs ----
# Draw a plot of mtcars into a new PDF, test.pdf, then close the device so
# the file is written out.
grDevices::pdf(file = "test.pdf")
plot(mtcars)
grDevices::dev.off()
## png 
##   2
# Join test.pdf and subset.pdf (in that order) into joined.pdf.
# subset.pdf must already exist in the working directory.
pdftools::pdf_combine(
  input = c("test.pdf", "subset.pdf"),
  output = "joined.pdf"
)
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\joined.pdf"
# The plot page plus the 14 pages of subset.pdf: this should say 15.
pdftools::pdf_length(input = "joined.pdf")
## [1] 15
# 1.3 Collect several plots in one PDF ----
# Everything plotted while the pdf device is open is written to myOut.pdf;
# the file is finalised when the device is closed with dev.off().
grDevices::pdf(file = "myOut.pdf")
for (i in seq_len(10L)) {
  # Ten fresh standard-normal draws per plot.
  plot(rnorm(n = 10))
}
grDevices::dev.off()
## png 
##   2
# Search for keywords ----
#install.packages("pdfsearch")
library(pdfsearch)
# Show the working directory; the example PDF is looked up relative to it.
getwd()
## [1] "C:/Users/liyix/OneDrive/Desktop"
# Locate the example paper without hard-coding a user-specific absolute path.
# Prefer a copy in the working directory; otherwise fall back to the copy that
# the pdfsearch vignette loads via system.file(). system.file() returns ""
# when the file is not installed, which is reported as an explicit error.
file <- file.path(getwd(), "1610.00147.pdf")
if (!file.exists(file)) {
  file <- system.file("pdf", "1610.00147.pdf", package = "pdfsearch")
}
if (!nzchar(file) || !file.exists(file)) {
  stop(
    "Cannot find 1610.00147.pdf in the working directory or in the ",
    "pdfsearch package.",
    call. = FALSE
  )
}

# Search the PDF for both keywords. path = TRUE tells keyword_search() that
# `file` is a path to a PDF; surround_lines = 1 is passed so each hit comes
# with neighbouring lines (each line_text entry below holds three lines).
result <- keyword_search(
  file,
  keyword = c("measurement", "error"),
  path = TRUE,
  surround_lines = 1
)
head(result)
## # A tibble: 6 x 5
##   keyword     page_num line_num line_text token_text
##   <chr>          <int>    <int> <list>    <list>    
## 1 measurement        1        2 <chr [3]> <list [3]>
## 2 measurement        1        4 <chr [3]> <list [3]>
## 3 measurement        1       10 <chr [3]> <list [3]>
## 4 measurement        1       12 <chr [3]> <list [3]>
## 5 measurement        1       15 <chr [3]> <list [3]>
## 6 measurement        1       17 <chr [3]> <list [3]>
# Text captured for the first two hits.
head(result$line_text, n = 2)
## [[1]]
## [1] "Data Fusion for Correcting Measurement Errors Tracy Schifeling, Jerome P. "                                                                                                    
## [2] "Reiter, Maria DeYoreo<U+2217> arXiv:1610.00147v1 [stat.ME] 1 Oct 2016 Abstract Often in surveys, key items are subject to measurement errors. "                                
## [3] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
## 
## [[2]]
## [1] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
## [2] "In some settings, however, analysts have access to a data source on different individuals with high quality measurements of the error-prone survey items. "                    
## [3] "We present a data fusion framework for leveraging this information to improve inferences in the error-prone survey. "
# Surrounding lines of text ----
# Besides the line in which a keyword occurs, it can help to pull in the
# neighbouring lines so that every hit comes with some context. Here
# surround_lines = 1 is passed, and each line_text entry in the printed
# result holds three lines.
# NOTE(review): this call uses the same arguments as the keyword_search()
# call in the keyword-search section of this script -- confirm whether that
# earlier call was meant to omit surround_lines.
result <- pdfsearch::keyword_search(
  file,
  keyword = c("measurement", "error"),
  path = TRUE,
  surround_lines = 1
)
head(x = result)
## # A tibble: 6 x 5
##   keyword     page_num line_num line_text token_text
##   <chr>          <int>    <int> <list>    <list>    
## 1 measurement        1        2 <chr [3]> <list [3]>
## 2 measurement        1        4 <chr [3]> <list [3]>
## 3 measurement        1       10 <chr [3]> <list [3]>
## 4 measurement        1       12 <chr [3]> <list [3]>
## 5 measurement        1       15 <chr [3]> <list [3]>
## 6 measurement        1       17 <chr [3]> <list [3]>
# Text (hit line plus context) for the first two hits.
head(x = result[["line_text"]], n = 2)
## [[1]]
## [1] "Data Fusion for Correcting Measurement Errors Tracy Schifeling, Jerome P. "                                                                                                    
## [2] "Reiter, Maria DeYoreo<U+2217> arXiv:1610.00147v1 [stat.ME] 1 Oct 2016 Abstract Often in surveys, key items are subject to measurement errors. "                                
## [3] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
## 
## [[2]]
## [1] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
## [2] "In some settings, however, analysts have access to a data source on different individuals with high quality measurements of the error-prone survey items. "                    
## [3] "We present a data fusion framework for leveraging this information to improve inferences in the error-prone survey. "
# Split the document into words ----
# Tokenise the PDF at `file` (path = TRUE marks it as a file path) and keep
# the first element of what convert_tokens() returns. The printed value is a
# list of character vectors of words.
token_result <- pdfsearch::convert_tokens(file, path = TRUE)[[1L]]
head(x = token_result)
## [[1]]
##   [1] "data"           "fusion"         "for"            "correcting"    
##   [5] "measurement"    "errors"         "tracy"          "schifeling"    
##   [9] "jerome"         "p"              "reiter"         "maria"         
##  [13] "deyoreo"        "arxiv"          "1610.00147v1"   "stat.me"       
##  [17] "1"              "oct"            "2016"           "abstract"      
##  [21] "often"          "in"             "surveys"        "key"           
##  [25] "items"          "are"            "subject"        "to"            
##  [29] "measurement"    "errors"         "given"          "just"          
##  [33] "the"            "data"           "it"             "can"           
##  [37] "be"             "difficult"      "to"             "determine"     
##  [41] "the"            "distribution"   "of"             "this"          
##  [45] "error"          "process"        "and"            "hence"         
##  [49] "to"             "obtain"         "accurate"       "inferences"    
##  [53] "that"           "involve"        "the"            "error"         
##  [57] "prone"          "variables"      "in"             "some"          
##  [61] "settings"       "however"        "analysts"       "have"          
##  [65] "access"         "to"             "a"              "data"          
##  [69] "source"         "on"             "different"      "in"            
##  [73] "dividuals"      "with"           "high"           "quality"       
##  [77] "measurements"   "of"             "the"            "error"         
##  [81] "prone"          "survey"         "items"          "we"            
##  [85] "present"        "a"              "data"           "fusion"        
##  [89] "framework"      "for"            "leveraging"     "this"          
##  [93] "information"    "to"             "improve"        "infer"         
##  [97] "ences"          "in"             "the"            "error"         
## [101] "prone"          "survey"         "the"            "basic"         
## [105] "idea"           "is"             "to"             "posit"         
## [109] "models"         "about"          "the"            "rates"         
## [113] "at"             "which"          "individuals"    "make"          
## [117] "errors"         "coupled"        "with"           "models"        
## [121] "for"            "the"            "values"         "reported"      
## [125] "when"           "errors"         "are"            "made"          
## [129] "this"           "can"            "avoid"          "the"           
## [133] "unrealistic"    "assumption"     "of"             "conditional"   
## [137] "independence"   "typically"      "used"           "in"            
## [141] "data"           "fusion"         "we"             "apply"         
## [145] "the"            "approach"       "on"             "the"           
## [149] "re"             "ported"         "values"         "of"            
## [153] "educational"    "attainments"    "in"             "the"           
## [157] "american"       "community"      "survey"         "using"         
## [161] "the"            "national"       "survey"         "of"            
## [165] "college"        "graduates"      "as"             "the"           
## [169] "high"           "quality"        "data"           "source"        
## [173] "in"             "doing"          "so"             "we"            
## [177] "account"        "for"            "the"            "informative"   
## [181] "sampling"       "design"         "used"           "to"            
## [185] "select"         "the"            "national"       "survey"        
## [189] "of"             "college"        "graduates"      "we"            
## [193] "also"           "present"        "a"              "process"       
## [197] "for"            "assessing"      "the"            "sensitivity"   
## [201] "of"             "various"        "analyses"       "to"            
## [205] "different"      "choices"        "for"            "the"           
## [209] "measurement"    "error"          "models"         "supplemental"  
## [213] "material"       "is"             "available"      "online"        
## [217] "key"            "words"          "fusion"         "imputation"    
## [221] "measurement"    "error"          "missing"        "survey"        
## [225] "this"           "research"       "was"            "supported"     
## [229] "by"             "the"            "national"       "science"       
## [233] "foundation"     "under"          "award"          "ses"           
## [237] "11"             "31897"          "the"            "authors"       
## [241] "wish"           "to"             "thank"          "seth"          
## [245] "sanders"        "for"            "his"            "input"         
## [249] "on"             "informative"    "prior"          "specifications"
## [253] "and"            "mauricio"       "sadinle"        "for"           
## [257] "discussion"     "that"           "improved"       "the"           
## [261] "strategy"       "for"            "accounting"     "for"           
## [265] "the"            "informative"    "sample"         "design"        
## [269] "1"
##############
#ref https://cran.r-project.org/web/packages/pdfsearch/vignettes/intro_to_pdfsearch.html