Import PDF files

The data include 677 civil cases from Belknap County, BCDD in Concord, and Hillsborough North.

Take caution when opening this text data in a spreadsheet program; it may crash the computer.

Belknap and BCDD

This data was collected by Sam Martel and Jeramiah Linscott in the summer of 2018. There are 72 cases from BCDD (Concord) and 30 cases from Belknap County Superior Court.

# Load readtext package
library(readtext)
library(tidyverse)

# Import files
DATA_DIR <- "C:/Users/sclee1/OneDrive/Documents/research paper/Legal Analytics/data/stateCases/"
data_belknap <- readtext(paste0(DATA_DIR, "SamNJeramiah/Belknap Court/*"))
data_belknap_processed <-
  data_belknap %>%
  mutate(doc_id = str_remove_all(doc_id, "Complaint|\\.docx"),
         Jurisdiction = "Belknap")

glimpse(data_belknap_processed)
## Observations: 30
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
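
Note that the cleaned doc_id values keep a leading space left over from the "Complaint " prefix in the file names (e.g., " 211-2016-CV-00001" above). The outputs in this post reflect the untrimmed IDs; a str_trim() call, sketched below, would normalize them before any later merging.

# Sketch (not applied above): trim the leading space left behind
# after stripping the "Complaint" prefix from the file names
data_belknap_processed <- data_belknap_processed %>%
  mutate(doc_id = str_trim(doc_id))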

data_bcdd <- readtext(paste0(DATA_DIR, "SamNJeramiah/BCDD (Concord Court)/*"))
data_bcdd_processed <-
  data_bcdd %>%
  mutate(doc_id = str_remove_all(doc_id, "Complaint|\\.docx"),
         Jurisdiction = "Merrimack")

glimpse(data_bcdd_processed)
## Observations: 72
## Variables: 3
## $ doc_id       <chr> " 212-2013-CV-00012", " 215-2011-CV-00537", " 215...
## $ text         <chr> "ALICE FINN \nv. \nBALLENTINE PARTNERS LLC, Succe...
## $ Jurisdiction <chr> "Merrimack", "Merrimack", "Merrimack", "Merrimack...

data_belknapNbcdd <- rbind(data_belknap_processed, data_bcdd_processed)
glimpse(data_belknapNbcdd)
## Observations: 102
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...

Hillsborough North

This data was collected by Cam and Brad Romeo in the summer of 2019. There are 575 cases from Hillsborough North.

# Import and clean Hillsborough data that Cam and Brad collected
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))
data_Hills_processed <-
  data_Hills %>% 
  # Take the first 9 characters of the repeated doc_id format: e.g., 2017/0388.docx/0388.docx
  mutate(doc_id = str_sub(doc_id, 1,9)) %>%
  # separate docket number into county, year, type and case
  separate(doc_id, c("year","caseN"), remove = FALSE) %>%
  # Pad one more 0 in front of caseN
  mutate(caseN = str_pad(caseN,width=5, side = "left", pad = "0"),
         type = "CV",
         county = "216",
         Jurisdiction = "HillsN") %>%
  # will need doc_id whose form is consistent with the one in the spreadsheet for merging
  unite(doc_id, c(county,year,type,caseN), sep = "-", remove = FALSE) %>%
  select(doc_id, text, Jurisdiction)

glimpse(data_Hills_processed)
## Observations: 575
## Variables: 3
## $ doc_id       <chr> "216-2016-CV-00022", "216-2016-CV-00026", "216-20...
## $ text         <chr> "THE STATE OF NEW HAMPSHIRESUPERIOR COURT\nHILLSB...
## $ Jurisdiction <chr> "HillsN", "HillsN", "HillsN", "HillsN", "HillsN",...
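
As a quick sanity check (a sketch I am adding here, not part of the original run), the rebuilt docket numbers can be validated against the expected 216-YYYY-CV-NNNNN pattern:

# Sketch: every doc_id should match the rebuilt docket format
data_Hills_processed %>%
  mutate(valid = str_detect(doc_id, "^216-\\d{4}-CV-\\d{5}$")) %>%
  count(valid)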

Merge all three

data_merged <- rbind(data_belknap_processed, data_bcdd_processed, data_Hills_processed)
glimpse(data_merged)
## Observations: 677
## Variables: 3
## $ doc_id       <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...

data_processed <-
  data_merged %>%
  rename(DoNo = doc_id)

glimpse(data_processed)
## Observations: 677
## Variables: 3
## $ DoNo         <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text         <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
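
The renamed DoNo column is what will link these texts to the case-level spreadsheet mentioned earlier. Below is a minimal sketch of that merge, where spreadsheet_df and its DoNo column are hypothetical stand-ins for the actual spreadsheet; the Belknap and BCDD IDs would also need their leading space trimmed first.

# Hypothetical merge: spreadsheet_df is a stand-in for the case spreadsheet
data_joined <-
  data_processed %>%
  mutate(DoNo = str_trim(DoNo)) %>%
  left_join(spreadsheet_df, by = "DoNo")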

Unnest and filter texts

A challenge here is that I need to remove not only common English words (e.g., is, his, the) but also legal jargon. I used tf-idf-based filtering instead of the common anti_join(stop_words) approach because stop_words doesn't include legal jargon.

Using tf-idf

I used three filtering mechanisms: 1) stop_words for common English words (e.g., is, his), 2) inverse document frequency (idf) to control for legal jargon, and 3) a minimum number of documents in which a word must appear, which controls for proper nouns (e.g., names of the plaintiffs).

The analysis below includes the words that satisfy the following three conditions:

  • idf >= 0.5
  • the word appears in at least 20 cases
  • the word is not in stop_words from the tidytext package

Note that an idf of 0 means the word shows up in all documents: all 677 cases. Much legal jargon will have an idf of 0 or a very low value. The higher the idf threshold, the more legal jargon gets filtered out (a stricter standard), but at the risk of excluding potentially important words from the analysis. Another challenge in an analysis like ours is the abundance of proper nouns (e.g., names of the defendant and the plaintiff) specific to each case. One way to filter them out is to require that a word appear in a minimum number of cases. Setting this minimum too low leaves many case-specific names in the data and produces many pairs of words with perfect correlation. In our data, a minimum of 10 was too low and caused many perfectly correlated pairs; raising it to 20 resolved the issue.
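
As a small illustration with made-up documents (not part of the case data), bind_tf_idf() assigns an idf of 0 to a word that appears in every document, so a filter like idf >= 0.5 removes it:

library(tidytext)
library(tidyverse)

# Toy corpus: "court" appears in all three documents, "widget" in only one
toy_counts <- tribble(
  ~doc, ~word,    ~n,
  "a",  "court",   5,
  "b",  "court",   2,
  "c",  "court",   1,
  "a",  "widget",  3
)

toy_counts %>%
  bind_tf_idf(word, doc, n)
# idf for "court" is log(3/3) = 0; for "widget", log(3/1) ≈ 1.10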

library(tidytext)

# how many times each court case contains each word
data_word <-
  data_processed %>%
  unnest_tokens(word, text) %>%
  count(DoNo, word, sort = TRUE) %>%
  bind_tf_idf(word, DoNo, n) %>% 
  # filter out legal jargon that appears in almost all cases
  filter(idf >= 0.5) %>% 
  # filter out common English words
  anti_join(stop_words) %>%
  # keep tokens that contain at least one letter
  filter(str_detect(word, "[a-z]")) %>%
  # drop tokens that contain digits
  filter(!str_detect(word, "[0-9]"))

# Remove words with a small number of occurrences
data_word_filtered <-
  data_word %>%
  # count the word once per case
  distinct(DoNo, word) %>%
  add_count(word) %>%
  filter(n >= 20)

glimpse(data_word_filtered)
## Observations: 150,343
## Variables: 3
## $ DoNo <chr> " 217-2012-CV-00658", "216-2017-CV-00466", "216-2016-CV-0...
## $ word <chr> "plaintiffs", "note", "borrower", "board", "borrower", "b...
## $ n    <int> 315, 106, 42, 65, 42, 42, 42, 51, 51, 42, 51, 54, 48, 51,...

Export data

#write.csv(data_word_filtered, "C:/Users/sclee1/OneDrive/Documents/R/legalAnalytics/data/stateCases_words.csv")
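
If the export is uncommented, note that write.csv() also writes row names as an unnamed first column; readr's write_csv() (loaded with the tidyverse) skips them. A sketch with the same path:

# Sketch: the same export without the row-name column
#write_csv(data_word_filtered, "C:/Users/sclee1/OneDrive/Documents/R/legalAnalytics/data/stateCases_words.csv")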

Word frequency and correlated pairs

data_word_filtered %>%
  # count one word per case
  distinct(DoNo, word) %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "cornflowerblue") +
  coord_flip() +
  labs(title = "Most Common Words in Complaint Pages",
       subtitle = "Belknap, Merrimack and Hillsborough North",
       y = "Number of cases with the word",
       x = NULL)


# identify correlated word pairs
library(widyr)
top_word_cors <-
  data_word_filtered %>%
  pairwise_cor(word, DoNo, sort = TRUE) %>% 
  head(200)

head(top_word_cors, 50)
## item1              item2              correlation
## stillman           schlee               1.0000000
## schlee             stillman             1.0000000
## mcdowell           ofmcdowell           0.9781940
## ofmcdowell         mcdowell             0.9781940
## mailto:ereczek     ar.com               0.9781940
## ar.com             mailto:ereczek       0.9781940
## unjustly           enriched             0.9753560
## enriched           unjustly             0.9753560
## deeds              registry             0.9631695
## registry           deeds                0.9631695
## schlee             woburn               0.9621382
## stillman           woburn               0.9621382
## woburn             schlee               0.9621382
## woburn             stillman             0.9621382
## schlee             o’brien              0.9592920
## stillman           o’brien              0.9592920
## o’brien            schlee               0.9592920
## o’brien            stillman             0.9592920
## mailto:ereczek     ereczek              0.9592920
## topsfield          ereczek              0.9592920
## ereczek            mailto:ereczek       0.9592920
## ereczek            topsfield            0.9592920
## osbumprofessional  ofmcdowell           0.9540709
## associationp       ofmcdowell           0.9540709
## ofmcdowell         osbumprofessional    0.9540709
## ofmcdowell         associationp         0.9540709
## multiply           atm                  0.9520058
## atm                multiply             0.9520058
## meruit             quantum              0.9484733
## quantum            meruit               0.9484733
## ereczek            ar.com               0.9383736
## ar.com             ereczek              0.9383736
## mcdowell           osbumprofessional    0.9332664
## osbumprofessional  mcdowell             0.9332664
## associationp       mcdowell             0.9332664
## mcdowell           associationp         0.9332664
## inter              alia                 0.9303669
## alia               inter                0.9303669
## utilization        schlee               0.9235314
## utilization        stillman             0.9235314
## schlee             utilization          0.9235314
## stillman           utilization          0.9235314
## o’brien            woburn               0.9229714
## woburn             o’brien              0.9229714
## registry           recorded             0.9143667
## recorded           registry             0.9143667
## topsfield          mailto:ereczek       0.9135945
## mailto:ereczek     topsfield            0.9135945
## multiply           coupon               0.9109051
## coupon             multiply             0.9109051
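
Several of the top pairs (e.g., mailto:ereczek, ar.com, associationp) are tokenization artifacts from e-mail addresses and run-together words rather than substantive terms. One way to screen them out before computing correlations is sketched below; the specific patterns are my guesses, not part of the original analysis.

# Sketch: drop tokens that look like e-mail/URL fragments before correlating
data_word_filtered %>%
  filter(!str_detect(word, "mailto|\\.com")) %>%
  pairwise_cor(word, DoNo, sort = TRUE) %>%
  head(200)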

Appendix

# Import and clean Hillsborough data that Cam and Brad collected
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))
data_Hills_processed <-
  data_Hills %>% 
  # Take the first 9 characters of the repeated doc_id format: e.g., 2017/0388.docx/0388.docx
  mutate(doc_id = str_sub(doc_id, 1,9)) %>%
  # separate docket number into county, year, type and case
  separate(doc_id, c("year","caseN"), remove = FALSE) %>%
  # Pad one more 0 in front of caseN
  mutate(caseN = str_pad(caseN,width=5, side = "left", pad = "0"),
         type = "CV",
         county = "216") %>%
  # will need doc_id whose form is consistent with the one in the spreadsheet for merging
  unite(doc_id, c(county,year,type,caseN), sep = "-", remove = FALSE) 

glimpse(data_Hills_processed)

Using anti_join(stop_words)

library(tidytext)

data_word <-
  data_processed %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(str_detect(word, regex("[a-z]", ignore_case = TRUE))) %>%
  # remove common legal words (placeholder: list to be filled in)
  filter(!word %in% c(""))

data_word %>%
  count(word, sort = TRUE)

# Remove words with a small number of occurrences
data_word_filtered <-
  data_word %>%
  # count each word once per case
  distinct(DoNo, word) %>%
  add_count(word) %>%
  filter(n >= 50)

head(data_word_filtered)
# A good resource for useful stringr functions:
# https://www.brodrigues.co/blog/2018-06-10-scraping_pdfs/
# Load pdftools package
library(pdftools)
library(tidyverse)

# Import files
DATA_DIR <- "e:/Court Files/"
pdf_path <- paste0(DATA_DIR, "Belknap Court/211-2016-CV-00001.pdf")
data <- pdf_text(pdf_path)
data
str(data)
cat(data[2])
# pdf_data() takes the file path, not the text returned by pdf_text()
pdf_data(pdf_path)[[1]]

table <-
  data[1] %>%
  str_split("\n", simplify = TRUE) %>%
  str_squish() 

caseN <- str_which(table, "Case No\\.")
caseN
table[3]
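
From here the docket number itself could be pulled out of the matched line; a sketch, assuming dockets follow the NNN-YYYY-CV-NNNNN pattern seen earlier:

# Sketch: extract the docket number from the line(s) matched above
str_extract(table[caseN], "\\d{3}-\\d{4}-CV-\\d{5}")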