library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(quanteda)
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 6 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textmodels)
library(spacyr)
library(stopwords)
# Read every file in the State folder (a mix of .pdf and .txt) into one table.
# The encoding is stated explicitly: the token listings printed further down
# show sequences such as "â\200ƒ" in the .txt documents, which is what UTF-8
# bytes look like when they are decoded with the native Windows code page.
my_texts <- readtext::readtext(
  "C:/Users/kebre/Rwd/Minimum Wage Research/State",
  encoding = "UTF-8"
)
#State
# Build the corpus, then keep its per-document summary (the Text, Types,
# Tokens and Sentences columns) for use as metadata later on.
minwage_corpus <- corpus(my_texts)
minwage_summary <- summary(minwage_corpus)
minwage_summary
## Corpus consisting of 13 documents, showing 13 documents:
##
## Text Types Tokens Sentences
## Arkansas.pdf 2049 14290 675
## Colorado.pdf 118 185 7
## Florida.1.txt 399 1503 49
## Florida.2.txt 383 978 18
## FLSA.pdf 3386 77668 2357
## Georgia.txt 185 394 6
## Massachusetts.pdf 267 2332 22
## Minnesota.pdf 528 2172 46
## Pennsylvania.pdf 1513 10363 493
## Texas.txt 799 4088 245
## Utah.pdf 551 2343 48
## Wisconsin.pdf 2487 21060 793
## Wyoming.pdf 5445 97310 2094
#State
# A freshly built corpus has no document-level variables yet.
print(docvars(minwage_corpus))
## data frame with 0 columns and 13 rows
# Label every row of the summary table with the collection it belongs to.
minwage_summary[["book"]] <- "Minimum Wage Legislation"
print(minwage_summary)
## Corpus consisting of 13 documents, showing 13 documents:
##
## Text Types Tokens Sentences book
## Arkansas.pdf 2049 14290 675 Minimum Wage Legislation
## Colorado.pdf 118 185 7 Minimum Wage Legislation
## Florida.1.txt 399 1503 49 Minimum Wage Legislation
## Florida.2.txt 383 978 18 Minimum Wage Legislation
## FLSA.pdf 3386 77668 2357 Minimum Wage Legislation
## Georgia.txt 185 394 6 Minimum Wage Legislation
## Massachusetts.pdf 267 2332 22 Minimum Wage Legislation
## Minnesota.pdf 528 2172 46 Minimum Wage Legislation
## Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation
## Texas.txt 799 4088 245 Minimum Wage Legislation
## Utah.pdf 551 2343 48 Minimum Wage Legislation
## Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation
## Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation
# Take the first run of digits in each file name and store it as a numeric
# `chapter`; file names without any digits end up as NA.
minwage_summary$chapter <- minwage_summary$Text %>%
  str_extract("[0-9]+") %>%
  as.numeric()
print(minwage_summary)
## Corpus consisting of 13 documents, showing 13 documents:
##
## Text Types Tokens Sentences book chapter
## Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA
## Colorado.pdf 118 185 7 Minimum Wage Legislation NA
## Florida.1.txt 399 1503 49 Minimum Wage Legislation 1
## Florida.2.txt 383 978 18 Minimum Wage Legislation 2
## FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA
## Georgia.txt 185 394 6 Minimum Wage Legislation NA
## Massachusetts.pdf 267 2332 22 Minimum Wage Legislation NA
## Minnesota.pdf 528 2172 46 Minimum Wage Legislation NA
## Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation NA
## Texas.txt 799 4088 245 Minimum Wage Legislation NA
## Utah.pdf 551 2343 48 Minimum Wage Legislation NA
## Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation NA
## Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation NA
# Store the whole summary table (Text, Types, Tokens, Sentences, book, chapter)
# as the corpus's document variables. Because the count columns are now
# docvars too, a later summary() of the corpus lists them twice.
docvars(minwage_corpus) <- minwage_summary
print(docvars(minwage_corpus))
## Text Types Tokens Sentences book chapter
## 1 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA
## 2 Colorado.pdf 118 185 7 Minimum Wage Legislation NA
## 3 Florida.1.txt 399 1503 49 Minimum Wage Legislation 1
## 4 Florida.2.txt 383 978 18 Minimum Wage Legislation 2
## 5 FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA
## 6 Georgia.txt 185 394 6 Minimum Wage Legislation NA
## 7 Massachusetts.pdf 267 2332 22 Minimum Wage Legislation NA
## 8 Minnesota.pdf 528 2172 46 Minimum Wage Legislation NA
## 9 Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation NA
## 10 Texas.txt 799 4088 245 Minimum Wage Legislation NA
## 11 Utah.pdf 551 2343 48 Minimum Wage Legislation NA
## 12 Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation NA
## 13 Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation NA
#State
# Keep only the documents whose stored `Tokens` docvar is below 5,000.
small_corpus <- corpus_subset(x = minwage_corpus, subset = Tokens < 5000)
print(summary(small_corpus))
## Corpus consisting of 8 documents, showing 8 documents:
##
## Text Types Tokens Sentences Text Types Tokens
## Colorado.pdf 118 185 7 Colorado.pdf 118 185
## Florida.1.txt 399 1503 49 Florida.1.txt 399 1503
## Florida.2.txt 383 978 18 Florida.2.txt 383 978
## Georgia.txt 185 394 6 Georgia.txt 185 394
## Massachusetts.pdf 267 2332 22 Massachusetts.pdf 267 2332
## Minnesota.pdf 528 2172 46 Minnesota.pdf 528 2172
## Texas.txt 799 4088 245 Texas.txt 799 4088
## Utah.pdf 551 2343 48 Utah.pdf 551 2343
## Sentences book chapter
## 7 Minimum Wage Legislation NA
## 49 Minimum Wage Legislation 1
## 18 Minimum Wage Legislation 2
## 6 Minimum Wage Legislation NA
## 22 Minimum Wage Legislation NA
## 46 Minimum Wage Legislation NA
## 245 Minimum Wage Legislation NA
## 48 Minimum Wage Legislation NA
#State
# Tokenise with the default settings. As the printout shows, punctuation marks
# and numbers are kept and come out as tokens of their own.
minwage_tokens <- tokens(x = minwage_corpus)
minwage_tokens
## Tokens consisting of 13 documents and 6 docvars.
## Arkansas.pdf :
## [1] "AGENCY" "#010.14" "ADMINISTRATIVE" "RULES"
## [5] "REGARDING" "THE" "ARKANSAS" "MINIMUM"
## [9] "WAGE" "ACT" "LABOR" "STANDARDS"
## [ ... and 14,278 more ]
##
## Colorado.pdf :
## [1] "Division" "of" "Labor" "Standards" "and"
## [6] "Statistics" "633" "17th" "Street" ","
## [11] "Suite" "600"
## [ ... and 173 more ]
##
## Florida.1.txt :
## [1] "448.110â" "\200" "ƒState" "minimum"
## [5] "wage" ";" "annual" "wage"
## [9] "adjustment" ";" "enforcement.â" "\200"
## [ ... and 1,491 more ]
##
## Florida.2.txt :
## [1] "SECTION" "24" "." "â" "\200" "ƒFlorida"
## [7] "minimum" "wage.â" "\200" "\"" "(" "a"
## [ ... and 966 more ]
##
## FLSA.pdf :
## [1] "29" "USC" "Ch" "." "8" ":"
## [7] "FAIR" "LABOR" "STANDARDS" "From" "Title" "29"
## [ ... and 77,656 more ]
##
## Georgia.txt :
## [1] "2018" "Georgia" "Code" "Title" "34"
## [6] "-" "Labor" "and" "Industrial" "Relations"
## [11] "Chapter" "4"
## [ ... and 382 more ]
##
## [ reached max_ndoc ... 7 more documents ]
# Re-tokenise, this time dropping punctuation as well as numbers.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
minwage_tokens <- tokens(
  minwage_corpus,
  remove_punct = TRUE,
  remove_numbers = TRUE
)
print(minwage_tokens)
## Tokens consisting of 13 documents and 6 docvars.
## Arkansas.pdf :
## [1] "AGENCY" "#010.14" "ADMINISTRATIVE" "RULES"
## [5] "REGARDING" "THE" "ARKANSAS" "MINIMUM"
## [9] "WAGE" "ACT" "LABOR" "STANDARDS"
## [ ... and 11,752 more ]
##
## Colorado.pdf :
## [1] "Division" "of" "Labor" "Standards" "and"
## [6] "Statistics" "17th" "Street" "Suite" "Denver"
## [11] "CO" "80202-2107"
## [ ... and 125 more ]
##
## Florida.1.txt :
## [1] "448.110â" "\200" "ƒState" "minimum"
## [5] "wage" "annual" "wage" "adjustment"
## [9] "enforcement.â" "\200" "â" "\200"
## [ ... and 1,258 more ]
##
## Florida.2.txt :
## [1] "SECTION" "â" "\200" "ƒFlorida" "minimum" "wage.â"
## [7] "\200" "a" "â" "\200" "ƒPUBLIC" "POLICY.â"
## [ ... and 858 more ]
##
## FLSA.pdf :
## [1] "USC" "Ch" "FAIR" "LABOR" "STANDARDS" "From"
## [7] "Title" "LABOR" "CHAPTER" "FAIR" "LABOR" "STANDARDS"
## [ ... and 52,565 more ]
##
## Georgia.txt :
## [1] "Georgia" "Code" "Title" "Labor" "and"
## [6] "Industrial" "Relations" "Chapter" "Minimum" "Wage"
## [11] "Law" "Â"
## [ ... and 308 more ]
##
## [ reached max_ndoc ... 7 more documents ]
#employee/employer may/may not
# Keyword-in-context search for the two-word phrase "employer may";
# only the first few matches are displayed.
kwic.1 <- kwic(x = minwage_tokens, pattern = phrase("employer may"))
print(head(kwic.1))
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 895:896] working on fixed schedules an | employer may
## [Arkansas.pdf, 1058:1059] Arkansas In unusual circumstances an | employer may
## [Arkansas.pdf, 1942:1943] A Conditions of employment An | employer may
## [Arkansas.pdf, 2287:2288] Learners Learners and Apprentices An | employer may
## [Arkansas.pdf, 6018:6019] Deductions from minimum wage An | employer may
## [Arkansas.pdf, 6069:6070] the employee in writing An | employer may
##
## | maintain records showing instead of
## | petition the director to maintain
## | pay a full-time student a
## | employ a learner a student
## | not make deductions from the
## | not make deductions from the
# Same search for the phrase "employee may".
kwic.2 <- kwic(x = minwage_tokens, pattern = phrase("employee may"))
print(head(kwic.2))
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 7641:7642] work time For example an |
## [Arkansas.pdf, 8596:8597] more the employer and the |
## [Arkansas.pdf, 10565:10566] Enforcement A Employee Claims An |
## [FLSA.pdf, 16692:16693] maximum period during which an |
## [FLSA.pdf, 16725:16726] <U+FFFD> <U+FFFD> <U+FFFD> No eligible |
## [FLSA.pdf, 17345:17346] i and ii that an |
##
## employee may | voluntarily continue to work at
## employee may | agree to exclude bona fide
## employee may | file a claim with the
## employee may | be paid such wage as
## employee may | be paid the wage authorized
## employee may | be paid the wage authorized
# Three-word phrase: what an employer is told it "may not" do.
kwic.3 <- kwic(x = minwage_tokens, pattern = phrase("employer may not"))
print(head(kwic.3))
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 6018:6020] Deductions from minimum wage An | employer may not |
## [Arkansas.pdf, 6069:6071] the employee in writing An | employer may not |
## [FLSA.pdf, 3157:3159] regularly receive tips B An | employer may not |
## [FLSA.pdf, 4630:4632] of tips determined by the | employer may not |
## [FLSA.pdf, 5222:5224] of tips determined by the | employer may not |
## [Texas.txt, 1121:1123] EMPLOYEES SUBJECT TO CALL An | employer may not |
##
## make deductions from the minimum
## make deductions from the applicable
## keep tips received by its
## exceed the value of tips
## exceed the value of tips
## be required to pay an
# Three-word phrase: what an employee is told they "may not" do.
kwic.4 <- kwic(x = minwage_tokens, pattern = phrase("employee may not"))
print(head(kwic.4))
## Keyword-in-context with 2 matches.
## [Texas.txt, 2602:2604] similarly affected employees b An |
## [Wisconsin.pdf, 13362:13364] uninterrupted or employees relieving that |
##
## employee may not | be a plaintiff to an
## employee may not | be on duty for more
#Creating Corpora
***
#list out the object names needed
myBooks <- c("Minimum Wage Legislation")
#create loop
for(i in 1:length(myBooks)) {
  #create corpora
  corpusCall <- paste(myBooks[i],"_corpus <- corpus(",myBooks[i],")", sep = "")
  eval(parse(text=corpusCall))
  # change document names for each chapter to include the book title. If you don't do this, the document names will be duplicated and you'll get an error.
  namesCall <- paste("tmpNames <- docnames(",myBooks[i],"_corpus)", sep = "")
  eval(parse(text=namesCall))
  bindCall <- paste("docnames(",myBooks[i],"_corpus) <- paste(\"",myBooks[i],"\", tmpNames, sep = \"-\")", sep = "")
  eval(parse(text=bindCall))
  # create summary data
  summaryCall <- paste(myBooks[i],"_summary <- summary(",myBooks[i],"_corpus)", sep = "")
  eval(parse(text=summaryCall))
  # add indicator
  bookCall <- paste(myBooks[i],"_summary$book <- \"",myBooks[i],"\"", sep = "")
  eval(parse(text=bookCall))
  # add chapter indicator
  chapterCall <- paste(myBooks[i],"_summary$chapter <- as.numeric(str_extract(",myBooks[i],"_summary$Text, \"[0-9]+\"))", sep = "")
  eval(parse(text=chapterCall))
  # add meta data to each corpus
  metaCall <- paste("docvars(",myBooks[i],"_corpus) <- ",myBooks[i],"_summary", sep = "")
  eval(parse(text=metaCall))
}
docvars(minwage_corpus)
***
I am not sure how to get the loop above to accommodate my corpus. I have a state corpus, with one document per state, and a federal corpus with one document. I realize this isn't a lot of text, but I am not sure how to easily create the chapters and book as was done in the tutorial. Other than that, so far so good!!
# Copy the corpus's document variables into a stand-alone data frame and
# preview the first rows.
myData <- minwage_corpus %>%
  docvars()
print(head(myData))
## Text Types Tokens Sentences book chapter
## 1 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA
## 2 Colorado.pdf 118 185 7 Minimum Wage Legislation NA
## 3 Florida.1.txt 399 1503 49 Minimum Wage Legislation 1
## 4 Florida.2.txt 383 978 18 Minimum Wage Legislation 2
## 5 FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA
## 6 Georgia.txt 185 394 6 Minimum Wage Legislation NA
# Add the document text to the metadata table for the annotation tool; the
# column must be named `text`.
# The text is taken directly from the corpus. The visible script never defines
# an object called `text`, so a bare `text` would resolve to the base graphics
# function of that name instead of the documents.
myData$text <- unname(as.character(minwage_corpus))
# NOTE(review): cnlp_annotate() is not provided by any package attached at the
# top of this script -- presumably cleanNLP is loaded and initialised
# elsewhere; confirm.
annotated <- cnlp_annotate(myData)
## Processed document 10 of 13
# Preview the token-level annotation table.
print(head(annotated[["token"]]))
## # A tibble: 6 x 11
## doc_id sid tid token token_with_ws lemma upos xpos feats tid_source
## <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 1 1 AGENCY " ~ Agen~ ADV RB <NA> 2
## 2 1 1 2 # "#" # SYM NN Numb~ 0
## 3 1 1 3 010.14 "010.14\n\n\n\n\~ 010.~ NUM CD NumT~ 2
## 4 1 2 1 ADMIN~ "ADMINISTRATIVE " admi~ ADJ JJ Degr~ 2
## 5 1 2 2 RULES "RULES " rule NOUN NNS Numb~ 0
## 6 1 2 3 REGAR~ "REGARDING " rega~ VERB VBG Verb~ 8
## # ... with 1 more variable: relation <chr>
# Preview the document-level table (the metadata plus a `doc_id` key).
print(head(annotated[["document"]]))
## Text Types Tokens Sentences book chapter doc_id
## 1 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 2 Colorado.pdf 118 185 7 Minimum Wage Legislation NA 2
## 3 Florida.1.txt 399 1503 49 Minimum Wage Legislation 1 3
## 4 Florida.2.txt 383 978 18 Minimum Wage Legislation 2 4
## 5 FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA 5
## 6 Georgia.txt 185 394 6 Minimum Wage Legislation NA 6
# Attach each document's metadata to every one of its token rows, matching
# the two tables on `doc_id`.
annoData <- annotated$document %>%
  left_join(annotated$token, by = "doc_id")
print(head(annoData))
## Text Types Tokens Sentences book chapter doc_id
## 1 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 2 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 3 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 4 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 5 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## 6 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA 1
## sid tid token
## 1 1 1 AGENCY
## 2 1 2 #
## 3 1 3 010.14
## 4 2 1 ADMINISTRATIVE
## 5 2 2 RULES
## 6 2 3 REGARDING
## token_with_ws lemma
## 1 AGENCY Agency
## 2 # #
## 3 010.14\n\n\n\n\n 010.14
## 4 ADMINISTRATIVE administrative
## 5 RULES rule
## 6 REGARDING regard
## upos xpos feats tid_source relation
## 1 ADV RB <NA> 2 advmod
## 2 SYM NN Number=Sing 0 root
## 3 NUM CD NumType=Card 2 nummod
## 4 ADJ JJ Degree=Pos 2 amod
## 5 NOUN NNS Number=Plur 0 root
## 6 VERB VBG VerbForm=Ger 8 case
# Side-by-side view of the first 40 tokens and their lemmas.
cbind(
  head(annoData[["token"]], n = 40),
  head(annoData[["lemma"]], n = 40)
)
## [,1] [,2]
## [1,] "AGENCY" "Agency"
## [2,] "#" "#"
## [3,] "010.14" "010.14"
## [4,] "ADMINISTRATIVE" "administrative"
## [5,] "RULES" "rule"
## [6,] "REGARDING" "regard"
## [7,] "THE" "the"
## [8,] "ARKANSAS" "Arkansas"
## [9,] "MINIMUM" "minimum"
## [10,] "WAGE" "WAGE"
## [11,] "ACT" "Act"
## [12,] "LABOR" "Labor"
## [13,] "STANDARDS" "standards"
## [14,] "SECTION" "section"
## [15,] "DIVISION" "division"
## [16,] "OF" "of"
## [17,] "LABOR" "Labor"
## [18,] "ARKANSAS" "Arkansas"
## [19,] "DEPARTMENT" "Department"
## [20,] "OF" "of"
## [21,] "LABOR" "Labor"
## [22,] "AND" "and"
## [23,] "LICENSING" "Licensing"
## [24,] "Rules" "rule"
## [25,] "effective" "effective"
## [26,] "as" "as"
## [27,] "of" "of"
## [28,] "July" "July"
## [29,] "2" "2"
## [30,] "," ","
## [31,] "2020" "2020"
## [32,] "Arkansas" "Arkansas"
## [33,] "Department" "Department"
## [34,] "of" "of"
## [35,] "Labor" "Labor"
## [36,] "and" "and"
## [37,] "Licensing" "Licensing"
## [38,] "Division" "division"
## [39,] "of" "of"
## [40,] "Labor" "Labor"
It looks like natural language processing isn't too helpful in my case; the language in these documents isn't exactly natural! It's institutional!