Import data

library(dplyr)
# Load the complaints and keep two columns: the docket number (renamed
# `book`) and the complaint body (renamed `text`, coerced to character).
# Then print the structure of the result.
data <- read.csv("complaint.csv") %>%
  select(book = docket_number, text = complaint) %>%
  mutate(text = as.character(text))
str(data)
## 'data.frame':    52 obs. of  2 variables:
##  $ book: Factor w/ 52 levels "","1:05-cv-00330-SM",..: 37 42 18 23 3 24 30 7 34 25 ...
##  $ text: chr  "THE STATE OF NEW HAMPSHIRE\nJUD|C!AL BRANCH\nSUPERlOR COURT .\nCarroll Superior Court l Telephone: 1\xbb855~212~1234\n\n96 Wate "                           UNITED STATES DISTRICT COURT\n                          FOR THE DISTRICT OF NEW HAMPSHIRE\n\n******* "Case 1:11-cv-00278-JD Document 1-1 Filed 06/06/11 Page 1 of 15\n\n\n\n\n           EXHIBIT 1\n            Case 1:11-cv-00278-JD "            Case 1:12-cv-00384-JD Document 1 Filed 10/11/12 Page 1 of 3\n\n\n                            UNITED STATES DISTRICT ...

Identify business type

Identify cases that don’t involve businesses and filter them out. Businesses are identified by the following terms: LLC, PC, PA, Inc, corp, LP, LLP, PLP, PLLP, PLLC, dba, d/b/a, pbc, gp.

library(stringr)
# Keep only complaints that mention a business-entity designator.
#
# The earlier pattern joined letters with ".*" (e.g. "P.*C.*"), which matches
# any line containing a "p" followed anywhere by a "c". Its multi-line string
# literal also embedded newlines and indentation into some alternatives.
# Here each term must appear as a whole word (case-insensitive), and acronyms
# may carry optional periods, so both "LLC" and "L.L.C." match.
# Re-knit to refresh the printed output below.
business_acronyms <- c(
  "LLC", "PC", "PA", "LP", "LLP", "PLP", "PLLP", "PLLC", "DBA", "PBC", "GP"
)
business_words <- c("Inc", "Corp", "d/b/a")
# "LLC" becomes "L\\.?L\\.?C": an optional period after each inner letter.
dotted_acronyms <- vapply(
  strsplit(business_acronyms, ""), paste, character(1), collapse = "\\.?"
)
business_pattern <- regex(
  paste0(
    "\\b(?:",
    paste(c(dotted_acronyms, business_words), collapse = "|"),
    ")\\b"
  ),
  ignore_case = TRUE
)
data_filtered <-
  data %>%
  # filter() also drops rows where str_detect() returns NA (missing text).
  filter(str_detect(text, business_pattern)) %>%
  select(book, text)
str(data_filtered)
## 'data.frame':    33 obs. of  2 variables:
##  $ book: Factor w/ 52 levels "","1:05-cv-00330-SM",..: 37 42 18 23 24 34 25 16 17 36 ...
##  $ text: chr  "THE STATE OF NEW HAMPSHIRE\nJUD|C!AL BRANCH\nSUPERlOR COURT .\nCarroll Superior Court l Telephone: 1\xbb855~212~1234\n\n96 Wate "                           UNITED STATES DISTRICT COURT\n                          FOR THE DISTRICT OF NEW HAMPSHIRE\n\n******* "Case 1:11-cv-00278-JD Document 1-1 Filed 06/06/11 Page 1 of 15\n\n\n\n\n           EXHIBIT 1\n            Case 1:11-cv-00278-JD "            Case 1:12-cv-00384-JD Document 1 Filed 10/11/12 Page 1 of 3\n\n\n                            UNITED STATES DISTRICT ...

Extract info on plaintiff and defendant

# Pull the first "plaintiff <word> <word>" phrase out of each complaint.
# Matching is case-insensitive; "\\s" is a single whitespace character and
# "[a-z]*" is a (possibly empty) run of letters. Complaints with no match
# get NA.
plaintiff_regex <- regex("plaintiff\\s[a-z]*\\s[a-z]*", ignore_case = TRUE)
data_filtered %>%
  transmute(book, plaintiff = str_extract(text, plaintiff_regex)) %>%
  head(n = 6)
##               book                plaintiff
## 1    1:15-cv-00363       Plaintiff seeks to
## 2    1:16-cv-00177 plaintiff Tracy Westcott
## 3 1:11-cv-00278-JD Plaintiff Kaitlin Hudson
## 4 1:12-cv-00384-JD   Plaintiff Amber Doolen
## 5 1:13-cv-00295-PB                     <NA>
## 6    1:15-cv-00096 plaintiff Denny Densmore

# Pull the first "defendant <word> <word> <more text>" phrase out of each
# complaint. Matching is case-insensitive; "." matches any character except
# a newline, "*" means zero or more repetitions, and "\\s" is a single
# whitespace character. Complaints with no match get NA.
defendant_regex <- regex(
  "defendant\\s[a-z]*\\s[a-z]*\\s.*\\s",
  ignore_case = TRUE
)
data_filtered %>%
  transmute(book, defendant = str_extract(text, defendant_regex)) %>%
  head(n = 6)
##               book
## 1    1:15-cv-00363
## 2    1:16-cv-00177
## 3 1:11-cv-00278-JD
## 4 1:12-cv-00384-JD
## 5 1:13-cv-00295-PB
## 6    1:15-cv-00096
##                                                                               defendant
## 1         Defendant Diversified Clinicai Services, Inc., (hereinafter collectively as\n
## 2                                                            defendant and resides at\n
## 3 Defendant Pain Care had actual knowledge, knew, or should have known of O'Connell's\n
## 4                             defendant Best Buy Stores, LP (\x93Best Buy\x94) hereby\n
## 5                                                           Defendant               )\n
## 6      Defendant has previously administered and will administer the USMLE Step Exams\n

Extract date filed

# Extract the filing date that follows the word "filed" in each complaint.
#
# The earlier pattern "filed.*\\s\\w+" was greedy and returned everything up
# to the end of the header line (e.g. "Filed 05/02/16 Page 2 of 8"). This
# pattern captures only the date itself, in numeric ("05/02/16") or written
# ("August 04, 2015") form, and skips occurrences of "filed" that are not
# followed by a date. Complaints with no such date get NA.
# Re-knit to refresh the printed output below.
filed_date_pattern <- regex(
  paste0(
    "filed:?\\s+(",
    "\\d{1,2}/\\d{1,2}/\\d{2,4}",          # numeric, e.g. 05/02/16
    "|[a-z]+\\.?\\s+\\d{1,2},?\\s+\\d{4}", # written, e.g. August 04, 2015
    ")"
  ),
  ignore_case = TRUE
)
data_filtered %>%
  # Column 2 of the str_match() matrix is the first capture group (the date).
  mutate(date_filed = str_match(text, filed_date_pattern)[, 2]) %>%
  select(book, date_filed) %>%
  head(10)
##                 book                  date_filed
## 1      1:15-cv-00363      Filed: August 04, 2015
## 2      1:16-cv-00177  Filed 05/02/16 Page 2 of 8
## 3   1:11-cv-00278-JD Filed 06/06/11 Page 1 of 15
## 4   1:12-cv-00384-JD  Filed 10/11/12 Page 1 of 3
## 5   1:13-cv-00295-PB Filed 06/28/13 Page 1 of 10
## 6      1:15-cv-00096                        <NA>
## 7   1:13-cv-00318-LM  Filed 07/15/13 Page 1 of 2
## 8   1:10-cv-00170-JL  Filed 05/04/10 Page 1 of 7
## 9  1:10-cv-00218-GZS Filed 06/07/10 Page 1 of 11
## 10     1:15-cv-00235 Filed 06/19/15 Page 1 of 15
# \w matches any “word” character, which includes alphabetic characters, marks and decimal numbers