Required Packages

library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(pdftools)
Using poppler version 22.02.0
library(vctrs)

Attaching package: ‘vctrs’

The following object is masked from ‘package:dplyr’:

    data_frame

The following object is masked from ‘package:tibble’:

    data_frame

The following code will act as a quick reference on how to scrape tables from PDF’s using the pdftools library in R. I am using the school budget books for DC Public Schools and am interested in gathering data just on the total budget and the Full Time equivalents for all of the schools. Prior to scraping the data, due to the amount of extra data and number of pages in each budget book, I did a decent amount of preprocessing on the PDF’s prior to working with them. Specifically, I cropped them twice to create two seperate PDF’s. One had just the names for the schools, the other had just the relevent tables. This simlified the scraping process, as the budget books were not fully standardized in terms of how they structured the tables and info.

School Budgets

Book 1

I am first looking to capture the data related to the school budgets only.

book_1 <- pdf_text("budget_1.pdf")
# Split list by lines, creating a vector for each line
book_1 <- strsplit(book_1, split = "\n")
# Cpature the relevent data accross all schools
book_1 <- lapply(book_1, head, 4)
book_1 <- lapply(book_1, tail, 3)
book_1 <- lapply(book_1, str_remove, ".*:")
book_1 <- lapply(book_1, tibble)
# map through all of the schools, adjust variables
book_1 <- map_df(1:length(book_1), ~ {
  book_1[[.x]] %>%
    rename(budget ="<chr>") %>% 
     mutate(
    year = 2018:2020, 
    budget = str_remove(budget, ","), 
    budget = as.numeric(budget) * 1000
  ) 
})


# Get School Names
school_names <- pdf_text("names_1.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 3)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:3, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
budget_1 <- bind_cols(book_1, school_names)

Book 2

# upload budget
book_2 <- pdf_text("budget_2.pdf")
# Split list by lines, creating a vector for each line
book_2 <- strsplit(book_2, split = "\n")
# Cpature the relevent data accross all schools
book_2 <- lapply(book_2, head, 4)
book_2 <- lapply(book_2, tail, 3)
book_2 <- lapply(book_2, str_remove, ".*:")
book_2 <- lapply(book_2, tibble)
# map through all of the schools, adjust variables
book_2 <- map_df(1:length(book_2), ~ {
  book_2[[.x]] %>%
    rename(budget ="<chr>") %>% 
     mutate(
    year = 2021:2023, 
    budget = str_remove(budget, ","), 
    budget = as.numeric(budget) * 1000
  ) 
})

# Get School Names
school_names <- pdf_text("names_2.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 3)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:3, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
budget_2 <- bind_cols(book_2, school_names)

Combine DF

# Pivot the DFs
budget_1 <- budget_1 %>% pivot_wider(
  names_from = year, values_from = budget
)
budget_2 <- budget_2 %>% pivot_wider(
  names_from = year, values_from = budget
) %>% arrange(schools)
# Standardize Schools 
## High Schools
budget_1$schools <- str_replace_all(budget_1$schools, "High School", "HS")
## Middle Schools
budget_1$schools <- str_replace_all(budget_1$schools, "Middle School", "MS")
## Elementary Schools
budget_1$schools <- str_replace_all(budget_1$schools, "Elementary School", "ES")
## Education Campus
budget_1$schools <- str_replace_all(budget_1$schools, "Education Campus", "EC")
df <- full_join(budget_1, budget_2, by = "schools")
df %>% arrange(schools) %>% writexl::write_xlsx("school_budgets.xlsx")
writexl::write_xlsx(list(
  budget_1 = budget_1, 
  budget_2 = budget_2
), "school_budgets.xlsx")

Full time equivalents

Budget 1

# Load in PDF
ft_1 <- pdf_text("test.pdf")
# Split into Vectors
ft_1 <- strsplit(ft_1, "\n")
ft_1 <- lapply(ft_1, trimws)
# Filter across all elements of the large list with a lambda function
ft_1 <- lapply(ft_1, function(x)
  x[str_detect(x,"^Total   ")])
# ft_1 Empty Lists 
ft_1 <- list_drop_empty(ft_1)
# Create Data Frame, select columns of interest
ft_1 <- map_dfr(ft_1, ~{
  data.frame(str_split_fixed(.x, " {1,}", 11))
}) %>% select(X2, X3, X4, X7, X8, X9) %>% 
  rename(budget_2018 = X2, 
         budget_2019 = X3, 
         budget_2020 = X4,
         ft_2018 = X7, 
         ft_2019 = X8, 
         ft_2020 = X9) %>% 
  pivot_longer(
    cols = 1:6,
    names_to = c("type", "year"), names_sep = "_", values_to = "value"
  ) %>% 
  mutate(
    value = str_remove(value, ","), 
    value = as.numeric(value), 
    value = case_when(
      type == "budget" ~ value * 1000, 
      T ~ value
    )
  )
# Add in School Names
# Get School Names
school_names <- pdf_text("names_1.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 6)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:6, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
ft_1 <- bind_cols(ft_1, school_names)

Budget 2

# Load in PDF
ft_2 <- pdf_text("ft_2.pdf")
# Split into Vectors
ft_2 <- strsplit(ft_2, "\n")
ft_2 <- lapply(ft_2, trimws)
# Filter across all elements of the large list with a lambda function
ft_2 <- lapply(ft_2, function(x)
  x[str_detect(x,"^Total   ")])

# ft_1 Empty Lists 
ft_2 <- list_drop_empty(ft_2)
# Create Data Frame, select columns of interest
ft_2 <- map_dfr(ft_2, ~{
  data.frame(str_split_fixed(.x, " {1,}", 11))
}) %>%
  select(X2, X3, X4, X7, X8, X9) %>% 
  rename(budget_2021 = X2, 
         budget_2022 = X3, 
         budget_2023 = X4,
         ft_2021 = X7, 
         ft_2022 = X8, 
         ft_2023 = X9) %>% 
  pivot_longer(
    cols = 1:6,
    names_to = c("type", "year"), names_sep = "_", values_to = "value"
  ) %>% 
  mutate(
    value = str_remove(value, ","), 
    value = as.numeric(value), 
    value = case_when(
      type == "budget" ~ value * 1000, 
      T ~ value
    )
  )

# Get School Names
school_names <- pdf_text("names_2.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 6)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:6, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
ft_2 <- bind_cols(ft_2, school_names)

Combine DF

# Pivot Df
ft_1 <- ft_1 %>% 
  pivot_wider(names_from = c(type, year), values_from = value)
ft_2 <- ft_2 %>% 
  pivot_wider(names_from = c(type, year), values_from = value)
# Standardize Schools 
## High Schools
ft_1$schools <- str_replace_all(ft_1$schools, "High School", "HS")
## Middle Schools
ft_1$schools <- str_replace_all(ft_1$schools, "Middle School", "MS")
## Elementary Schools
ft_1$schools <- str_replace_all(ft_1$schools, "Elementary School", "ES")
## Education Campus
ft_1$schools <- str_replace_all(ft_1$schools, "Education Campus", "EC")
df <- full_join(ft_1, ft_2, by = c("schools"))
df %>% arrange(schools) %>%  writexl::write_xlsx("school_budgets.xlsx")
---
title: "Scraping Tables from PDF's using pdftools"
author: Nick Dodds
output: html_notebook
---
# Required Packages
```{r echo=TRUE, message=FALSE, warning=FALSE}
library(tidyverse)
library(pdftools)
```
The following code will act as a quick reference on how to scrape tables from 
PDF's using the pdftools library in R. I am using the school budget books for 
DC Public Schools and am interested in gathering data just on the total budget
and the Full Time equivalents for all of the schools. 
Prior to scraping the data, due to the amount of extra data and number of 
pages in each budget book, I did a decent amount of preprocessing on the PDF's 
prior to working with them. Specifically, I cropped them twice to create 
two seperate PDF's. One had just the names for the schools, the other had just 
the relevent tables. This simlified the scraping process, as the budget books 
were not fully standardized in terms of how they structured the tables and info. 

# School Budgets 
## Book 1
I am first looking to capture the data related to the school budgets only.
```{r echo=TRUE}
book_1 <- pdf_text("budget_1.pdf")
# Split list by lines, creating a vector for each line
book_1 <- strsplit(book_1, split = "\n")
# Cpature the relevent data accross all schools
book_1 <- lapply(book_1, head, 4)
book_1 <- lapply(book_1, tail, 3)
book_1 <- lapply(book_1, str_remove, ".*:")
book_1 <- lapply(book_1, tibble)
# map through all of the schools, adjust variables
book_1 <- map_df(1:length(book_1), ~ {
  book_1[[.x]] %>%
    rename(budget ="<chr>") %>% 
     mutate(
    year = 2018:2020, 
    budget = str_remove(budget, ","), 
    budget = as.numeric(budget) * 1000
  ) 
})


# Get School Names
school_names <- pdf_text("names_1.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 3)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:3, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
budget_1 <- bind_cols(book_1, school_names)
```

## Book 2
```{r echo=TRUE}
# upload budget
book_2 <- pdf_text("budget_2.pdf")
# Split list by lines, creating a vector for each line
book_2 <- strsplit(book_2, split = "\n")
# Cpature the relevent data accross all schools
book_2 <- lapply(book_2, head, 4)
book_2 <- lapply(book_2, tail, 3)
book_2 <- lapply(book_2, str_remove, ".*:")
book_2 <- lapply(book_2, tibble)
# map through all of the schools, adjust variables
book_2 <- map_df(1:length(book_2), ~ {
  book_2[[.x]] %>%
    rename(budget ="<chr>") %>% 
     mutate(
    year = 2021:2023, 
    budget = str_remove(budget, ","), 
    budget = as.numeric(budget) * 1000
  ) 
})

# Get School Names
school_names <- pdf_text("names_2.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 3)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:3, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
budget_2 <- bind_cols(book_2, school_names)
```
##  Combine DF
```{r echo=TRUE}
# Pivot the DFs
budget_1 <- budget_1 %>% pivot_wider(
  names_from = year, values_from = budget
)
budget_2 <- budget_2 %>% pivot_wider(
  names_from = year, values_from = budget
) %>% arrange(schools)
# Standardize Schools 
## High Schools
budget_1$schools <- str_replace_all(budget_1$schools, "High School", "HS")
## Middle Schools
budget_1$schools <- str_replace_all(budget_1$schools, "Middle School", "MS")
## Elementary Schools
budget_1$schools <- str_replace_all(budget_1$schools, "Elementary School", "ES")
## Education Campus
budget_1$schools <- str_replace_all(budget_1$schools, "Education Campus", "EC")
df <- full_join(budget_1, budget_2, by = "schools")
df %>% arrange(schools) %>% writexl::write_xlsx("school_budgets.xlsx")
writexl::write_xlsx(list(
  budget_1 = budget_1, 
  budget_2 = budget_2
), "school_budgets.xlsx")
```
# Full time equivalents
## Budget 1
```{r echo=TRUE}
# Load in PDF
ft_1 <- pdf_text("test.pdf")
# Split into Vectors
ft_1 <- strsplit(ft_1, "\n")
ft_1 <- lapply(ft_1, trimws)
# Filter across all elements of the large list with a lambda function
ft_1 <- lapply(ft_1, function(x)
  x[str_detect(x,"^Total   ")])
# ft_1 Empty Lists 
ft_1 <- list_drop_empty(ft_1)
# Create Data Frame, select columns of interest
ft_1 <- map_dfr(ft_1, ~{
  data.frame(str_split_fixed(.x, " {1,}", 11))
}) %>% select(X2, X3, X4, X7, X8, X9) %>% 
  rename(budget_2018 = X2, 
         budget_2019 = X3, 
         budget_2020 = X4,
         ft_2018 = X7, 
         ft_2019 = X8, 
         ft_2020 = X9) %>% 
  pivot_longer(
    cols = 1:6,
    names_to = c("type", "year"), names_sep = "_", values_to = "value"
  ) %>% 
  mutate(
    value = str_remove(value, ","), 
    value = as.numeric(value), 
    value = case_when(
      type == "budget" ~ value * 1000, 
      T ~ value
    )
  )
# Add in School Names
# Get School Names
school_names <- pdf_text("names_1.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 6)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:6, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
ft_1 <- bind_cols(ft_1, school_names)
```
## Budget 2
```{r echo=TRUE}
# Load in PDF
ft_2 <- pdf_text("ft_2.pdf")
# Split into Vectors
ft_2 <- strsplit(ft_2, "\n")
ft_2 <- lapply(ft_2, trimws)
# Filter across all elements of the large list with a lambda function
ft_2 <- lapply(ft_2, function(x)
  x[str_detect(x,"^Total   ")])

# ft_1 Empty Lists 
ft_2 <- list_drop_empty(ft_2)
# Create Data Frame, select columns of interest
ft_2 <- map_dfr(ft_2, ~{
  data.frame(str_split_fixed(.x, " {1,}", 11))
}) %>%
  select(X2, X3, X4, X7, X8, X9) %>% 
  rename(budget_2021 = X2, 
         budget_2022 = X3, 
         budget_2023 = X4,
         ft_2021 = X7, 
         ft_2022 = X8, 
         ft_2023 = X9) %>% 
  pivot_longer(
    cols = 1:6,
    names_to = c("type", "year"), names_sep = "_", values_to = "value"
  ) %>% 
  mutate(
    value = str_remove(value, ","), 
    value = as.numeric(value), 
    value = case_when(
      type == "budget" ~ value * 1000, 
      T ~ value
    )
  )

# Get School Names
school_names <- pdf_text("names_2.pdf")
# Split list by lines
school_names <- strsplit(school_names, "\n")
# Capture Names 
school_names <- lapply(school_names, head, 1)
school_names <- lapply(school_names, trimws)
school_names <- lapply(school_names, rep, 6)
# Create a single  DF 
school_names <- as.data.frame(do.call(rbind, school_names))
school_names <- school_names %>% pivot_longer(cols = 1:6, names_to = "v", 
                                              values_to = "schools") %>% 
  select(-v)
ft_2 <- bind_cols(ft_2, school_names)
```

##  Combine DF 
```{r echo=TRUE}
# Pivot Df
ft_1 <- ft_1 %>% 
  pivot_wider(names_from = c(type, year), values_from = value)
ft_2 <- ft_2 %>% 
  pivot_wider(names_from = c(type, year), values_from = value)
# Standardize Schools 
## High Schools
ft_1$schools <- str_replace_all(ft_1$schools, "High School", "HS")
## Middle Schools
ft_1$schools <- str_replace_all(ft_1$schools, "Middle School", "MS")
## Elementary Schools
ft_1$schools <- str_replace_all(ft_1$schools, "Elementary School", "ES")
## Education Campus
ft_1$schools <- str_replace_all(ft_1$schools, "Education Campus", "EC")
df <- full_join(ft_1, ft_2, by = c("schools"))
df %>% arrange(schools) %>%  writexl::write_xlsx("school_budgets.xlsx")
```

