The data includes 677 civil cases from Belknap County, BCDD in Concord, and Hillsborough North.
Use caution when opening this text data in a spreadsheet: it will crash the computer.
This data was collected by Sam Martel and Jeramiah Linscott in the summer of 2018. There are 72 cases from BCDD (Concord) and 30 cases from Belknap County Superior Court.
# Load readtext package
library(readtext)
library(tidyverse)
# Import files
# Root folder holding the collected state-court complaint files.
DATA_DIR <- "C:/Users/sclee1/OneDrive/Documents/research paper/Legal Analytics/data/stateCases/"

# Read every file in the Belknap folder. The result has one row per file,
# with the file name in `doc_id` and the document body in `text`.
data_belknap <- readtext(paste0(DATA_DIR, "SamNJeramiah/Belknap Court/*"))

# Reduce the file name to the bare docket number and tag the court.
data_belknap_processed <-
  data_belknap %>%
  mutate(
    # "\\." matches a literal dot (a bare "." is a regex wildcard), and
    # str_trim() drops the blank left behind once "Complaint" is removed, so
    # the docket number carries no leading space and can be matched against
    # docket numbers from other sources.
    doc_id = str_trim(str_remove_all(doc_id, "Complaint|\\.docx")),
    Jurisdiction = "Belknap"
  )
glimpse(data_belknap_processed)
## Observations: 30
## Variables: 3
## $ doc_id <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
# Read every file in the BCDD (Concord) folder; one row per file.
data_bcdd <- readtext(paste0(DATA_DIR, "SamNJeramiah/BCDD (Concord Court)/*"))

# Reduce the file name to the bare docket number and tag the court.
data_bcdd_processed <-
  data_bcdd %>%
  mutate(
    # "\\." matches a literal dot (a bare "." is a regex wildcard), and
    # str_trim() drops the blank left behind once "Complaint" is removed, so
    # the docket number carries no leading space and can be matched against
    # docket numbers from other sources.
    doc_id = str_trim(str_remove_all(doc_id, "Complaint|\\.docx")),
    Jurisdiction = "Merrimack"
  )
glimpse(data_bcdd_processed)
## Observations: 72
## Variables: 3
## $ doc_id <chr> " 212-2013-CV-00012", " 215-2011-CV-00537", " 215...
## $ text <chr> "ALICE FINN \nv. \nBALLENTINE PARTNERS LLC, Succe...
## $ Jurisdiction <chr> "Merrimack", "Merrimack", "Merrimack", "Merrimack...
# Stack the two 2018 collections row-wise: Belknap first, then BCDD.
data_belknapNbcdd <- do.call(
  rbind,
  list(data_belknap_processed, data_bcdd_processed)
)
glimpse(data_belknapNbcdd)
## Observations: 102
## Variables: 3
## $ doc_id <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
This data was collected by Cam and Brad Romeo in the summer of 2019. There are 575 cases from Hillsborough North.
# Import and clean Hillsborough data that Cam and Brad collected
# Read every Hillsborough North complaint; one row per file.
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))

# Rebuild each docket number in the county-year-type-case form.
data_Hills_processed <-
  data_Hills %>%
  # doc_id repeats the file name (e.g., 2017/0388.docx/0388.docx); only the
  # first nine characters ("2017/0388") are needed
  mutate(doc_id = str_sub(doc_id, start = 1, end = 9)) %>%
  # split those nine characters into the filing year and the case number
  separate(col = doc_id, into = c("year", "caseN"), remove = FALSE) %>%
  mutate(
    # left-pad the case number with zeros to five characters
    caseN = str_pad(caseN, width = 5, side = "left", pad = "0"),
    # constant docket components for this collection
    type = "CV",
    county = "216",
    Jurisdiction = "HillsN"
  ) %>%
  # overwrite doc_id with the assembled docket number, e.g. 216-2016-CV-00022
  unite(col = doc_id, c(county, year, type, caseN), sep = "-", remove = FALSE) %>%
  # keep the same three columns as the other two collections
  select(doc_id, text, Jurisdiction)
glimpse(data_Hills_processed)
## Observations: 575
## Variables: 3
## $ doc_id <chr> "216-2016-CV-00022", "216-2016-CV-00026", "216-20...
## $ text <chr> "THE STATE OF NEW HAMPSHIRESUPERIOR COURT\nHILLSB...
## $ Jurisdiction <chr> "HillsN", "HillsN", "HillsN", "HillsN", "HillsN",...
# Combine all three courts row-wise: Belknap, then BCDD, then Hillsborough.
data_merged <-
  data_belknap_processed %>%
  rbind(data_bcdd_processed, data_Hills_processed)
glimpse(data_merged)
## Observations: 677
## Variables: 3
## $ doc_id <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
# Rename the docket-number key from `doc_id` to `DoNo`.
data_processed <- rename(data_merged, DoNo = doc_id)
glimpse(data_processed)
## Observations: 677
## Variables: 3
## $ DoNo <chr> " 211-2016-CV-00001", " 211-2016-CV-00002", " 211...
## $ text <chr> "STATE OF NEW HAMPSHIRE JUDICIAL BRANCH BELKNAP, ...
## $ Jurisdiction <chr> "Belknap", "Belknap", "Belknap", "Belknap", "Belk...
A challenge here is that I should remove not only common English words (e.g., is, his, the) but also legal jargon. I used a tf-idf-based filter in addition to the common anti_join(stop_words) approach because stop_words doesn’t include legal jargon.
I used three filtering mechanisms: 1) stop_words for common English words (e.g., is, and, his), 2) inverse document frequency (idf) to control for legal jargon, and 3) a minimum number of documents in which the word appears, to control for proper nouns (e.g., the names of the plaintiffs).
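The analysis below includes the words that satisfy the following three conditions: 1) the word is not in stop_words, 2) its idf is at least 0.5, and 3) it appears in at least 20 cases.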
Note that an idf of 0 means that the word shows up in all documents: all 677 cases. Much legal jargon would surely have an idf of 0 or a very low number. The higher the idf threshold used as a filter, the more legal jargon is filtered out (a stricter standard), but at the risk of excluding potentially important words from the analysis. Another challenge in an analysis like ours is the large number of proper nouns (e.g., names of the defendant and the plaintiff) specific to each case. One way to filter out proper nouns is to set a minimum number of cases in which the word must appear. Note that a low minimum may produce many pairs of words with perfect correlation because of the many proper nouns. In our data, a minimum of 10 was too small and produced many perfectly correlated pairs. Setting it to 20 resolved the issue.
library(tidytext)
# how many times each court case contains each word
# Per-case word counts with tf-idf, followed by the vocabulary filters.
data_word <-
  data_processed %>%
  # one row per (case, token); unnest_tokens() lower-cases tokens by default
  unnest_tokens(word, text) %>%
  # how many times each court case contains each word
  count(DoNo, word, sort = TRUE) %>%
  # tf-idf is computed on the full vocabulary, before any word is dropped
  bind_tf_idf(word, DoNo, n) %>%
  # filter out legal jargon that appears in almost all cases (low idf)
  filter(idf >= 0.5) %>%
  # filter out common English words; the join key is named explicitly so the
  # join does not depend on which column names the two tables happen to share
  anti_join(stop_words, by = "word") %>%
  # remove numbers: keep tokens that contain a letter and no digit
  filter(str_detect(word, "[a-z]"), !str_detect(word, "[0-9]"))
# Remove words with small # of occurencies
# Keep only words that occur in at least 20 different cases.
data_word_filtered <-
  data_word %>%
  # one row per (case, word), so each case contributes a word at most once
  distinct(DoNo, word) %>%
  # n = number of cases containing the word
  group_by(word) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n >= 20)
glimpse(data_word_filtered)
## Observations: 150,343
## Variables: 3
## $ DoNo <chr> " 217-2012-CV-00658", "216-2017-CV-00466", "216-2016-CV-0...
## $ word <chr> "plaintiffs", "note", "borrower", "board", "borrower", "b...
## $ n <int> 315, 106, 42, 65, 42, 42, 42, 51, 51, 42, 51, 54, 48, 51,...
#write.csv(data_word_filtered, "C:/Users/sclee1/OneDrive/Documents/R/legalAnalytics/data/stateCases_words.csv")
# Import and clean Hillsborough data that Cam and Brad collected
# NOTE(review): this repeats the earlier Hillsborough import and reassigns
# data_Hills / data_Hills_processed. Unlike the earlier version it adds no
# Jurisdiction column and keeps the helper columns (year, caseN, type, county).
data_Hills <- readtext(paste0(DATA_DIR, "Cam/Complaint/*"))

data_Hills_processed <-
  data_Hills %>%
  # doc_id repeats the file name (e.g., 2017/0388.docx/0388.docx); only the
  # first nine characters ("2017/0388") are needed
  mutate(doc_id = str_sub(doc_id, start = 1, end = 9)) %>%
  # split those nine characters into the filing year and the case number
  separate(col = doc_id, into = c("year", "caseN"), remove = FALSE) %>%
  mutate(
    # left-pad the case number with zeros to five characters
    caseN = str_pad(caseN, width = 5, side = "left", pad = "0"),
    # constant docket components for this collection
    type = "CV",
    county = "216"
  ) %>%
  # overwrite doc_id with the assembled county-year-type-case docket number
  unite(col = doc_id, c(county, year, type, caseN), sep = "-", remove = FALSE)
glimpse(data_Hills_processed)
### Using anti_join(stop_words)
library(tidytext)
# Alternative tokenization that relies on the stop-word list alone (no tf-idf).
data_word <-
  data_processed %>%
  unnest_tokens(word, text) %>%
  # drop common English words; the join key is named explicitly so the join
  # does not depend on which column names the two tables happen to share
  anti_join(stop_words, by = "word") %>%
  # keep tokens that contain at least one letter
  filter(str_detect(word, regex("[a-z]", ignore_case = TRUE))) %>%
  # remove common legal words (the list currently holds only the empty string)
  filter(!word %in% c(""))

# Overall word frequencies, most frequent first.
data_word %>%
  count(word, sort = TRUE)
# Remove words with small # of occurencies
# Keep only words that occur in at least 50 different cases.
data_word_filtered <-
  data_word %>%
  # count each word only once per case; the case key is `DoNo` because
  # data_processed renamed `doc_id` to `DoNo`, so `doc_id` no longer exists
  distinct(DoNo, word) %>%
  # n = number of cases containing the word
  add_count(word) %>%
  filter(n >= 50)
head(data_word_filtered)
# a good resource for useful stringr function
# https://www.brodrigues.co/blog/2018-06-10-scraping_pdfs/
# Load pdftools package
library(pdftools)
library(tidyverse)
# Import files
# Exploration: read one complaint PDF directly with pdftools.
# NOTE(review): this reassigns DATA_DIR to a different drive than the one
# used by the readtext imports.
DATA_DIR <- "e:/Court Files/"
pdf_path <- paste0(DATA_DIR, "Belknap Court/211-2016-CV-00001.pdf")

# pdf_text() returns a character vector with one element per page.
data <- pdf_text(pdf_path)
data
str(data)
cat(data[2])

# pdf_data() reads the PDF itself, so it must be given the file path rather
# than the text already extracted by pdf_text(). `[1]` keeps the first page.
pdf_data(pdf_path)[1]

# Break the first page into lines and collapse runs of whitespace.
table <-
  data[1] %>%
  str_split("\n", simplify = TRUE) %>%
  str_squish()

# Index of the line(s) that mention the case number.
caseN <- str_which(table, "Case No\\.")
caseN
table[3]
# Load readtext package
library(readtext)
library(tidyverse)
# Import files
# Point DATA_DIR back at the stateCases folder and re-read the two 2018
# collections without any cleaning, to inspect them as imported.
DATA_DIR <- "C:/Users/sclee1/OneDrive/Documents/research paper/Legal Analytics/data/stateCases/"

data_belknap <- readtext(
  file = paste0(DATA_DIR, "SamNJeramiah/Belknap Court/*")
)
data_bcdd <- readtext(
  file = paste0(DATA_DIR, "SamNJeramiah/BCDD (Concord Court)/*")
)

glimpse(data_belknap)
glimpse(data_bcdd)