library(dplyr)
# Load the complaints and keep only the docket number and the complaint body,
# renamed to `book` and `text`. The text column is coerced to character so the
# string functions used later operate on plain strings.
data <- read.csv("complaint.csv") %>%
  select(book = docket_number, text = complaint) %>%
  mutate(text = as.character(text))

# Show the structure of the prepared data frame.
str(data)
## 'data.frame': 52 obs. of 2 variables:
## $ book: Factor w/ 52 levels "","1:05-cv-00330-SM",..: 37 42 18 23 3 24 30 7 34 25 ...
## $ text: chr "THE STATE OF NEW HAMPSHIRE\nJUD|C!AL BRANCH\nSUPERlOR COURT .\nCarroll Superior Court l Telephone: 1\xbb855~212~1234\n\n96 Wate " UNITED STATES DISTRICT COURT\n FOR THE DISTRICT OF NEW HAMPSHIRE\n\n******* "Case 1:11-cv-00278-JD Document 1-1 Filed 06/06/11 Page 1 of 15\n\n\n\n\n EXHIBIT 1\n Case 1:11-cv-00278-JD " Case 1:12-cv-00384-JD Document 1 Filed 10/11/12 Page 1 of 3\n\n\n UNITED STATES DISTRICT ...
Identify cases that don’t involve businesses and filter them out. Businesses are identified by the following terms: LLC, PC, PA, Inc, corp, LP, LLP, PLP, PLLP, PLLC, dba, d/b/a, pbc, gp.
library(stringr)
# Keep only the complaints whose text mentions a business-entity designator.
#
# The previous pattern joined letters with ".*" (e.g. "P.*A.*"), which matches
# any line containing a "p" followed somewhere by an "a", so it did not really
# test for the designators. It was also written as a string literal spanning
# several source lines, which embedded literal newlines in two alternatives,
# and it omitted "corp". The pattern below matches each designator as a whole
# word, case-insensitively.
business_terms <- c(
  "LLC", "PC", "PA", "Inc", "Corp", "LP", "LLP", "PLP", "PLLP", "PLLC",
  "dba", "pbc", "gp"
)

# Allow an optional period between letters so dotted spellings such as
# "L.L.C." or "P.A." are recognised as well.
dotted_terms <- vapply(
  strsplit(business_terms, "", fixed = TRUE),
  paste,
  character(1),
  collapse = "\\.?"
)

# "\\b" anchors each alternative at word boundaries; "d/b/a" is added verbatim
# because its separators are slashes rather than periods.
business_pattern <- regex(
  str_c("\\b(?:", str_c(c(dotted_terms, "d/b/a"), collapse = "|"), ")\\b"),
  ignore_case = TRUE
)

# filter() keeps rows where the condition is TRUE and drops FALSE and NA rows.
# Because the pattern is stricter than before, the number of retained rows
# may differ from earlier runs.
data_filtered <-
  data %>%
  filter(str_detect(text, business_pattern)) %>%
  select(book, text)

# Show the structure of the filtered data frame.
str(data_filtered)
## 'data.frame': 33 obs. of 2 variables:
## $ book: Factor w/ 52 levels "","1:05-cv-00330-SM",..: 37 42 18 23 24 34 25 16 17 36 ...
## $ text: chr "THE STATE OF NEW HAMPSHIRE\nJUD|C!AL BRANCH\nSUPERlOR COURT .\nCarroll Superior Court l Telephone: 1\xbb855~212~1234\n\n96 Wate " UNITED STATES DISTRICT COURT\n FOR THE DISTRICT OF NEW HAMPSHIRE\n\n******* "Case 1:11-cv-00278-JD Document 1-1 Filed 06/06/11 Page 1 of 15\n\n\n\n\n EXHIBIT 1\n Case 1:11-cv-00278-JD " Case 1:12-cv-00384-JD Document 1 Filed 10/11/12 Page 1 of 3\n\n\n UNITED STATES DISTRICT ...
# Preview the plaintiff named in each complaint.
# Pattern (case-insensitive): the word "plaintiff", one whitespace character
# ("\\s"), a run of zero or more letters ("[a-z]*"), one more whitespace
# character and another run of letters. str_extract() returns the first match
# per complaint, or NA when there is none.
plaintiff_pattern <- regex("plaintiff\\s[a-z]*\\s[a-z]*", ignore_case = TRUE)

data_filtered %>%
  transmute(book, plaintiff = str_extract(text, plaintiff_pattern)) %>%
  head()
## book plaintiff
## 1 1:15-cv-00363 Plaintiff seeks to
## 2 1:16-cv-00177 plaintiff Tracy Westcott
## 3 1:11-cv-00278-JD Plaintiff Kaitlin Hudson
## 4 1:12-cv-00384-JD Plaintiff Amber Doolen
## 5 1:13-cv-00295-PB <NA>
## 6 1:15-cv-00096 plaintiff Denny Densmore
# Preview the defendant named in each complaint.
# Pattern (case-insensitive): the word "defendant", then two groups of
# "whitespace + zero or more letters", one more whitespace character, any run
# of characters other than a newline (".*"), and a final whitespace character
# ("\\s"). str_extract() returns the first match per complaint, or NA.
defendant_pattern <- regex(
  "defendant\\s[a-z]*\\s[a-z]*\\s.*\\s",
  ignore_case = TRUE
)

data_filtered %>%
  transmute(book, defendant = str_extract(text, defendant_pattern)) %>%
  head()
## book
## 1 1:15-cv-00363
## 2 1:16-cv-00177
## 3 1:11-cv-00278-JD
## 4 1:12-cv-00384-JD
## 5 1:13-cv-00295-PB
## 6 1:15-cv-00096
## defendant
## 1 Defendant Diversified Clinicai Services, Inc., (hereinafter collectively as\n
## 2 defendant and resides at\n
## 3 Defendant Pain Care had actual knowledge, knew, or should have known of O'Connell's\n
## 4 defendant Best Buy Stores, LP (\x93Best Buy\x94) hereby\n
## 5 Defendant )\n
## 6 Defendant has previously administered and will administer the USMLE Step Exams\n
# Preview the filing information found in each complaint.
# Pattern (case-insensitive): the word "filed", any run of characters other
# than a newline (".*"), one whitespace character ("\\s") and one or more word
# characters ("\\w+"). str_extract() returns the first match per complaint,
# or NA.
# NOTE(review): ".*" is greedy, so the match includes the word "filed" and can
# run past the date (e.g. "Filed 05/02/16 Page 2 of 8"); confirm whether only
# the date itself is wanted here.
filed_pattern <- regex("filed.*\\s\\w+", ignore_case = TRUE)

data_filtered %>%
  transmute(book, date_filed = str_extract(text, filed_pattern)) %>%
  head(10)
## book date_filed
## 1 1:15-cv-00363 Filed: August 04, 2015
## 2 1:16-cv-00177 Filed 05/02/16 Page 2 of 8
## 3 1:11-cv-00278-JD Filed 06/06/11 Page 1 of 15
## 4 1:12-cv-00384-JD Filed 10/11/12 Page 1 of 3
## 5 1:13-cv-00295-PB Filed 06/28/13 Page 1 of 10
## 6 1:15-cv-00096 <NA>
## 7 1:13-cv-00318-LM Filed 07/15/13 Page 1 of 2
## 8 1:10-cv-00170-JL Filed 05/04/10 Page 1 of 7
## 9 1:10-cv-00218-GZS Filed 06/07/10 Page 1 of 11
## 10 1:15-cv-00235 Filed 06/19/15 Page 1 of 15
# \w matches any “word” character, which includes alphabetic characters, marks and decimal numbers