Import PDF files

The data include 677 civil cases from Belknap County, BCDD in Concord, and Hillsborough North.

Take caution when opening this text data in a spreadsheet program; it may crash the computer.

Belknap and BCDD

This data was collected by Sam Martel and Jeramiah Linscott in the summer of 2018. There are 72 cases from BCDD (Concord) and 30 cases from Belknap County Superior Court.

# Load readtext package
library(readtext)
library(tidyverse)

# Import files
DATA_DIR <- "C:/Users/sclee1/OneDrive/Documents/research paper/Legal Analytics/data/stateCases/"
data_belknap <- readtext(paste0(DATA_DIR, "SamNJeramiah/Belknap Court/*"))
data_belknap_processed <-
  data_belknap %>%
  mutate(doc_id = str_remove_all(doc_id, "Complaint|\\.docx"),
         Jurisdiction = "Belknap")

glimpse(data_belknap_processed)
## Observations: 30
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
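
Note that the cleaned doc_id values keep a leading space left over from the "Complaint " prefix in the file names (e.g., " 211-2016-CV-00001" above). The outputs in this post reflect the untrimmed IDs; a str_trim() call, sketched below, would normalize them before any later merging.

# Sketch (not applied above): trim the leading space left behind
# after stripping the "Complaint" prefix from the file names
data_belknap_processed <- data_belknap_processed %>%
  mutate(doc_id = str_trim(doc_id))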

data_bcdd <- readtext(paste0(DATA_DIR, "SamNJeramiah/BCDD (Concord Court)/*"))
data_bcdd_processed <-
  data_bcdd %>%
  mutate(doc_id = str_remove_all(doc_id, "Complaint|\\.docx"),
         Jurisdiction = "Merrimack")

glimpse(data_bcdd_processed)
## Observations: 72
## Variables: 3
## $ doc_id       <chr> " 212-2013-CV-00012", " 215-2011-CV-00537", " 215...
## $ text         <chr> "ALICE FINN \nv. \nBALLENTINE PARTNERS LLC, Succe...
## $ Jurisdiction <chr> "Merrimack", "Merrimack", "Merrimack", "Merrimack...

data_belknapNbcdd <- rbind(data_belknap_processed, data_bcdd_processed)
glimpse(data_belknapNbcdd)
## Observations: 102
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...

Hillsborough North

This data was collected by Cam and Brad Romeo in the summer of 2019. There are 575 cases from Hillsborough North.

# Import and clean Hillsborough data that Cam and Brad collected
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))
data_Hills_processed <-
  data_Hills %>% 
  # Take the first 9 characters of the repeated doc_id format: e.g., 2017/0388.docx/0388.docx
  mutate(doc_id = str_sub(doc_id, 1,9)) %>%
  # separate docket number into county, year, type and case
  separate(doc_id, c("year","caseN"), remove = FALSE) %>%
  # Pad one more 0 in front of caseN
  mutate(caseN = str_pad(caseN,width=5, side = "left", pad = "0"),
         type = "CV",
         county = "216",
         Jurisdiction = "HillsN") %>%
  # will need doc_id whose form is consistent with the one in the spreadsheet for merging
  unite(doc_id, c(county,year,type,caseN), sep = "-", remove = FALSE) %>%
  select(doc_id, text, Jurisdiction)

glimpse(data_Hills_processed)
## Observations: 575
## Variables: 3
## $ doc_id       <chr> "216-2016-CV-00022", "216-2016-CV-00026", "216-20...
## $ text         <chr> "THE STATE OF NEW HAMPSHIRESUPERIOR COURT\nHILLSB...
## $ Jurisdiction <chr> "HillsN", "HillsN", "HillsN", "HillsN", "HillsN",...
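
As a quick sanity check (a sketch I am adding here, not part of the original run), the rebuilt docket numbers can be validated against the expected 216-YYYY-CV-NNNNN pattern:

# Sketch: every doc_id should match the rebuilt docket format
data_Hills_processed %>%
  mutate(valid = str_detect(doc_id, "^216-\\d{4}-CV-\\d{5}$")) %>%
  count(valid)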

Merge all three

data_merged <- rbind(data_belknap_processed, data_bcdd_processed, data_Hills_processed)
glimpse(data_merged)
## Observations: 677
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...

data_processed <-
  data_merged %>%
  rename(DoNo = doc_id)

glimpse(data_processed)
## Observations: 677
## Variables: 3
## $ DoNo         <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
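
The renamed DoNo column is what will link these texts to the case-level spreadsheet mentioned earlier. Below is a minimal sketch of that merge, where spreadsheet_df and its DoNo column are hypothetical stand-ins for the actual spreadsheet; the Belknap and BCDD IDs would also need their leading space trimmed first.

# Hypothetical merge: spreadsheet_df is a stand-in for the case spreadsheet
data_joined <-
  data_processed %>%
  mutate(DoNo = str_trim(DoNo)) %>%
  left_join(spreadsheet_df, by = "DoNo")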

Unnest and filter texts

A challenge here is that I need to remove not only common English words (e.g., is, his, the) but also legal jargon. I used tf-idf-based filtering instead of the common anti_join(stop_words) approach because stop_words doesn't include legal jargon.

Using tf-idf

I used three filtering mechanisms: 1) stop_words for common English words (e.g., is, his), 2) inverse document frequency (idf) to control for legal jargon, and 3) a minimum number of documents in which a word must appear, which controls for proper nouns (e.g., names of the plaintiffs).

The analysis below includes the words that satisfy the following three conditions:

  • idf >= 0.5
  • the word appears in at least 20 cases
  • the word is not in stop_words from the tidytext package

Note that an idf of 0 means the word shows up in all documents: all 677 cases. Much legal jargon will have an idf of 0 or a very low value. The higher the idf threshold, the more legal jargon gets filtered out (a stricter standard), but at the risk of excluding potentially important words from the analysis. Another challenge in an analysis like ours is the abundance of proper nouns (e.g., names of the defendant and the plaintiff) specific to each case. One way to filter them out is to require that a word appear in a minimum number of cases. Setting this minimum too low leaves many case-specific names in the data and produces many pairs of words with perfect correlation. In our data, a minimum of 10 was too low and caused many perfectly correlated pairs; raising it to 20 resolved the issue.
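
As a small illustration with made-up documents (not part of the case data), bind_tf_idf() assigns an idf of 0 to a word that appears in every document, so a filter like idf >= 0.5 removes it:

library(tidytext)
library(tidyverse)

# Toy corpus: "court" appears in all three documents, "widget" in only one
toy_counts <- tribble(
  ~doc, ~word,    ~n,
  "a",  "court",   5,
  "b",  "court",   2,
  "c",  "court",   1,
  "a",  "widget",  3
)

toy_counts %>%
  bind_tf_idf(word, doc, n)
# idf for "court" is log(3/3) = 0; for "widget", log(3/1) ≈ 1.10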

library(tidytext)

# how many times each court case contains each word
data_word <-
  data_processed %>%
  unnest_tokens(word, text) %>%
  count(DoNo, word, sort = TRUE) %>%
  bind_tf_idf(word, DoNo, n) %>% 
  # filter out legal jargon that appears in almost all cases
  filter(idf >= 0.5) %>% 
  # filter out common English words
  anti_join(stop_words) %>%
  # keep tokens that contain at least one letter
  filter(str_detect(word, "[a-z]")) %>%
  # drop tokens that contain digits
  filter(!str_detect(word, "[0-9]"))

# Remove words with a small number of occurrences
data_word_filtered <-
  data_word %>%
  # count the word once per case
  distinct(DoNo, word) %>%
  add_count(word) %>%
  filter(n >= 20)

glimpse(data_word_filtered)
## Observations: 150,343
## Variables: 3
## $ DoNo <chr> " 217-2012-CV-00658", "216-2017-CV-00466", "216-2016-CV-0...
## $ word <chr> "plaintiffs", "note", "borrower", "board", "borrower", "b...
## $ n    <int> 315, 106, 42, 65, 42, 42, 42, 51, 51, 42, 51, 54, 48, 51,...

Export data

#write.csv(data_word_filtered, "C:/Users/sclee1/OneDrive/Documents/R/legalAnalytics/data/stateCases_words.csv")
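
If the export is uncommented, note that write.csv() also writes row names as an unnamed first column; readr's write_csv() (loaded with the tidyverse) skips them. A sketch with the same path:

# Sketch: the same export without the row-name column
#write_csv(data_word_filtered, "C:/Users/sclee1/OneDrive/Documents/R/legalAnalytics/data/stateCases_words.csv")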

Word frequency and correlated pairs

data_word_filtered %>%
  # count one word per case
  distinct(DoNo, word) %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "cornflowerblue") +
  coord_flip() +
  labs(title = "Most Common Words in Complaint Pages",
       subtitle = "Belknap, Merrimack and Hillsborough North",
       y = "Number of cases with the word",
       x = NULL)


# identify correlated word pairs
library(widyr)
top_word_cors <-
  data_word_filtered %>%
  pairwise_cor(word, DoNo, sort = TRUE) %>% 
  head(200)

head(top_word_cors, 50)
## item1              item2              correlation
## stillman           schlee               1.0000000
## schlee             stillman             1.0000000
## mcdowell           ofmcdowell           0.9781940
## ofmcdowell         mcdowell             0.9781940
## mailto:ereczek     ar.com               0.9781940
## ar.com             mailto:ereczek       0.9781940
## unjustly           enriched             0.9753560
## enriched           unjustly             0.9753560
## deeds              registry             0.9631695
## registry           deeds                0.9631695
## schlee             woburn               0.9621382
## stillman           woburn               0.9621382
## woburn             schlee               0.9621382
## woburn             stillman             0.9621382
## schlee             o’brien              0.9592920
## stillman           o’brien              0.9592920
## o’brien            schlee               0.9592920
## o’brien            stillman             0.9592920
## mailto:ereczek     ereczek              0.9592920
## topsfield          ereczek              0.9592920
## ereczek            mailto:ereczek       0.9592920
## ereczek            topsfield            0.9592920
## osbumprofessional  ofmcdowell           0.9540709
## associationp       ofmcdowell           0.9540709
## ofmcdowell         osbumprofessional    0.9540709
## ofmcdowell         associationp         0.9540709
## multiply           atm                  0.9520058
## atm                multiply             0.9520058
## meruit             quantum              0.9484733
## quantum            meruit               0.9484733
## ereczek            ar.com               0.9383736
## ar.com             ereczek              0.9383736
## mcdowell           osbumprofessional    0.9332664
## osbumprofessional  mcdowell             0.9332664
## associationp       mcdowell             0.9332664
## mcdowell           associationp         0.9332664
## inter              alia                 0.9303669
## alia               inter                0.9303669
## utilization        schlee               0.9235314
## utilization        stillman             0.9235314
## schlee             utilization          0.9235314
## stillman           utilization          0.9235314
## o’brien            woburn               0.9229714
## woburn             o’brien              0.9229714
## registry           recorded             0.9143667
## recorded           registry             0.9143667
## topsfield          mailto:ereczek       0.9135945
## mailto:ereczek     topsfield            0.9135945
## multiply           coupon               0.9109051
## coupon             multiply             0.9109051
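
Several of the top pairs (e.g., mailto:ereczek, ar.com, associationp) are tokenization artifacts from e-mail addresses and run-together words rather than substantive terms. One way to screen them out before computing correlations is sketched below; the specific patterns are my guesses, not part of the original analysis.

# Sketch: drop tokens that look like e-mail/URL fragments before correlating
data_word_filtered %>%
  filter(!str_detect(word, "mailto|\\.com")) %>%
  pairwise_cor(word, DoNo, sort = TRUE) %>%
  head(200)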

Appendix

# Import and clean Hillsborough data that Cam and Brad collected
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))
data_Hills_processed <-
  data_Hills %>% 
  # Take the first 9 characters of the repeated doc_id format: e.g., 2017/0388.docx/0388.docx
  mutate(doc_id = str_sub(doc_id, 1,9)) %>%
  # separate docket number into county, year, type and case
  separate(doc_id, c("year","caseN"), remove = FALSE) %>%
  # Pad one more 0 in front of caseN
  mutate(caseN = str_pad(caseN,width=5, side = "left", pad = "0"),
         type = "CV",
         county = "216") %>%
  # will need doc_id whose form is consistent with the one in the spreadsheet for merging
  unite(doc_id, c(county,year,type,caseN), sep = "-", remove = FALSE) 

glimpse(data_Hills_processed)

Using anti_join(stop_words)

library(tidytext)

data_word <-
  data_processed %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(str_detect(word, regex("[a-z]", ignore_case = TRUE))) %>%
  # remove common legal words (placeholder: list to be filled in)
  filter(!word %in% c(""))

data_word %>%
  count(word, sort = TRUE)

# Remove words with a small number of occurrences
data_word_filtered <-
  data_word %>%
  # count each word once per case
  distinct(DoNo, word) %>%
  add_count(word) %>%
  filter(n >= 50)

head(data_word_filtered)
# A good resource for useful stringr functions:
# https://www.brodrigues.co/blog/2018-06-10-scraping_pdfs/
# Load pdftools package
library(pdftools)
library(tidyverse)

# Import files
DATA_DIR <- "e:/Court Files/"
pdf_path <- paste0(DATA_DIR, "Belknap Court/211-2016-CV-00001.pdf")
data <- pdf_text(pdf_path)
data
str(data)
cat(data[2])
# pdf_data() takes the file path, not the text returned by pdf_text()
pdf_data(pdf_path)[[1]]

table <-
  data[1] %>%
  str_split("\n", simplify = TRUE) %>%
  str_squish() 

caseN <- str_which(table, "Case No\\.")
caseN
table[3]
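
From here the docket number itself could be pulled out of the matched line; a sketch, assuming dockets follow the NNN-YYYY-CV-NNNNN pattern seen earlier:

# Sketch: extract the docket number from the line(s) matched above
str_extract(table[caseN], "\\d{3}-\\d{4}-CV-\\d{5}")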