# Dealing with PDFs in R: pdftools and pdfsearch
rm(list = ls())
#install.packages("pdftools")
# Load pdftools
library(pdftools)
## Using poppler version 0.73.0
############## 1.1 Extract some pages
pdf_subset('v7.pdf',
           pages = 68:81, output = "subset.pdf")
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\subset.pdf"
# Should say 14 (pages 68 through 81)
pdf_length("subset.pdf") #[1] 14
## [1] 14
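# Hedged sketch (not in the original): pdf_info() from pdftools returns the
# metadata of a PDF, which is a quick way to confirm what the subset contains.
info <- pdf_info("subset.pdf")
info$pages   # should also report 14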
######################## 1.2 Combine PDFs
# Generate another pdf
pdf("test.pdf")
plot(mtcars)
dev.off()
## png
## 2
# Combine it with the subset created above
pdf_combine(c("test.pdf", "subset.pdf"), output = "joined.pdf")
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\joined.pdf"
# Should say 15 (1 page from test.pdf + 14 from subset.pdf)
pdf_length("joined.pdf") #[1] 15
## [1] 15
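# Hedged sketch (file names illustrative): pdf_combine() accepts any character
# vector of inputs, so every PDF in the working directory can be merged at once.
all_pdfs <- setdiff(list.files(pattern = "\\.pdf$"), "everything.pdf")
pdf_combine(all_pdfs, output = "everything.pdf")
pdf_length("everything.pdf")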
################################## 1.3 Write multiple plots into a single PDF
pdf("myOut.pdf")
for (i in 1:10) {
  plot(rnorm(10))
}
dev.off()
## png
## 2
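# Hedged check on the result: the loop writes one page per plot, so the file
# should be 10 pages long; pdf_text() likewise returns one string per page.
pdf_length("myOut.pdf")         # expected: 10
length(pdf_text("myOut.pdf"))   # also 10: one text string per page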
############################### 2. Search for keywords with pdfsearch
#install.packages("pdfsearch")
library(pdfsearch)
# Where does R keep its packages, and where are we working from?
system.file()
## [1] "C:/PROGRA~1/R/R-40~1.2/library/base"
getwd()
## [1] "C:/Users/liyix/OneDrive/Desktop"
file <- 'C:/Users/liyix/OneDrive/Desktop/1610.00147.pdf'
result <- keyword_search(file,
                         keyword = c('measurement', 'error'),
                         path = TRUE, surround_lines = 1)
head(result)
## # A tibble: 6 x 5
## keyword page_num line_num line_text token_text
## <chr> <int> <int> <list> <list>
## 1 measurement 1 2 <chr [3]> <list [3]>
## 2 measurement 1 4 <chr [3]> <list [3]>
## 3 measurement 1 10 <chr [3]> <list [3]>
## 4 measurement 1 12 <chr [3]> <list [3]>
## 5 measurement 1 15 <chr [3]> <list [3]>
## 6 measurement 1 17 <chr [3]> <list [3]>
head(result$line_text, n = 2)
## [[1]]
## [1] "Data Fusion for Correcting Measurement Errors Tracy Schifeling, Jerome P. "
## [2] "Reiter, Maria DeYoreo<U+2217> arXiv:1610.00147v1 [stat.ME] 1 Oct 2016 Abstract Often in surveys, key items are subject to measurement errors. "
## [3] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
##
## [[2]]
## [1] "Given just the data, it can be difficult to determine the distribution of this error process, and hence to obtain accurate inferences that involve the error-prone variables. "
## [2] "In some settings, however, analysts have access to a data source on different individuals with high quality measurements of the error-prone survey items. "
## [3] "We present a data fusion framework for leveraging this information to improve inferences in the error-prone survey. "
##### Surrounding lines of text
# It can be useful to extract not just the line the keyword appears in, but
# also the surrounding text, to have extra context when reviewing the results.
# The surround_lines = 1 argument passed to keyword_search() above does this:
# each element of line_text holds the matching line plus one line before and
# one line after it.
######################### 3. Split the document into words
# Convert the document into individual word tokens
token_result <- convert_tokens(file, path = TRUE)[[1]]
head(token_result)
## [[1]]
## [1] "data" "fusion" "for" "correcting"
## [5] "measurement" "errors" "tracy" "schifeling"
## [9] "jerome" "p" "reiter" "maria"
## [13] "deyoreo" "arxiv" "1610.00147v1" "stat.me"
## [17] "1" "oct" "2016" "abstract"
## [21] "often" "in" "surveys" "key"
## [25] "items" "are" "subject" "to"
## [29] "measurement" "errors" "given" "just"
## [33] "the" "data" "it" "can"
## [37] "be" "difficult" "to" "determine"
## [41] "the" "distribution" "of" "this"
## [45] "error" "process" "and" "hence"
## [49] "to" "obtain" "accurate" "inferences"
## [53] "that" "involve" "the" "error"
## [57] "prone" "variables" "in" "some"
## [61] "settings" "however" "analysts" "have"
## [65] "access" "to" "a" "data"
## [69] "source" "on" "different" "in"
## [73] "dividuals" "with" "high" "quality"
## [77] "measurements" "of" "the" "error"
## [81] "prone" "survey" "items" "we"
## [85] "present" "a" "data" "fusion"
## [89] "framework" "for" "leveraging" "this"
## [93] "information" "to" "improve" "infer"
## [97] "ences" "in" "the" "error"
## [101] "prone" "survey" "the" "basic"
## [105] "idea" "is" "to" "posit"
## [109] "models" "about" "the" "rates"
## [113] "at" "which" "individuals" "make"
## [117] "errors" "coupled" "with" "models"
## [121] "for" "the" "values" "reported"
## [125] "when" "errors" "are" "made"
## [129] "this" "can" "avoid" "the"
## [133] "unrealistic" "assumption" "of" "conditional"
## [137] "independence" "typically" "used" "in"
## [141] "data" "fusion" "we" "apply"
## [145] "the" "approach" "on" "the"
## [149] "re" "ported" "values" "of"
## [153] "educational" "attainments" "in" "the"
## [157] "american" "community" "survey" "using"
## [161] "the" "national" "survey" "of"
## [165] "college" "graduates" "as" "the"
## [169] "high" "quality" "data" "source"
## [173] "in" "doing" "so" "we"
## [177] "account" "for" "the" "informative"
## [181] "sampling" "design" "used" "to"
## [185] "select" "the" "national" "survey"
## [189] "of" "college" "graduates" "we"
## [193] "also" "present" "a" "process"
## [197] "for" "assessing" "the" "sensitivity"
## [201] "of" "various" "analyses" "to"
## [205] "different" "choices" "for" "the"
## [209] "measurement" "error" "models" "supplemental"
## [213] "material" "is" "available" "online"
## [217] "key" "words" "fusion" "imputation"
## [221] "measurement" "error" "missing" "survey"
## [225] "this" "research" "was" "supported"
## [229] "by" "the" "national" "science"
## [233] "foundation" "under" "award" "ses"
## [237] "11" "31897" "the" "authors"
## [241] "wish" "to" "thank" "seth"
## [245] "sanders" "for" "his" "input"
## [249] "on" "informative" "prior" "specifications"
## [253] "and" "mauricio" "sadinle" "for"
## [257] "discussion" "that" "improved" "the"
## [261] "strategy" "for" "accounting" "for"
## [265] "the" "informative" "sample" "design"
## [269] "1"
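# With the document tokenised, a rough word-frequency table is one line of
# base R (sketch, using the token list built above):
word_freq <- sort(table(unlist(token_result)), decreasing = TRUE)
head(word_freq, 10)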
##############
# Reference: https://cran.r-project.org/web/packages/pdfsearch/vignettes/intro_to_pdfsearch.html