library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(quanteda)
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 6 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textplots)
library(spacyr)
library(stopwords)
library(devtools)
## Loading required package: usethis
library(tidytext)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(dplyr)
my_texts <- readtext::readtext("C:/Users/kebre/Rwd/Minimum Wage Research/State")
#State
minwage_corpus <- corpus(my_texts)
minwage_summary <- summary(minwage_corpus)
minwage_summary
## Corpus consisting of 14 documents, showing 14 documents:
##
## Text Types Tokens Sentences
## Arkansas.pdf 2049 14290 675
## Colorado.pdf 118 185 7
## Florida.1.txt 399 1503 49
## Florida.2.txt 383 978 18
## FLSA.pdf 3386 77668 2357
## Georgia.txt 185 394 6
## Massachusetts.pdf 267 2332 22
## Minnesota.pdf 528 2172 46
## notes.txt 116 770 1
## Pennsylvania.pdf 1513 10363 493
## Texas.txt 799 4088 245
## Utah.pdf 551 2343 48
## Wisconsin.pdf 2487 21060 793
## Wyoming.pdf 5445 97310 2094
#State
docvars(minwage_corpus)
## data frame with 0 columns and 14 rows
minwage_summary$book <- "Minimum Wage Legislation"
minwage_summary
## Corpus consisting of 14 documents, showing 14 documents:
##
## Text Types Tokens Sentences book
## Arkansas.pdf 2049 14290 675 Minimum Wage Legislation
## Colorado.pdf 118 185 7 Minimum Wage Legislation
## Florida.1.txt 399 1503 49 Minimum Wage Legislation
## Florida.2.txt 383 978 18 Minimum Wage Legislation
## FLSA.pdf 3386 77668 2357 Minimum Wage Legislation
## Georgia.txt 185 394 6 Minimum Wage Legislation
## Massachusetts.pdf 267 2332 22 Minimum Wage Legislation
## Minnesota.pdf 528 2172 46 Minimum Wage Legislation
## notes.txt 116 770 1 Minimum Wage Legislation
## Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation
## Texas.txt 799 4088 245 Minimum Wage Legislation
## Utah.pdf 551 2343 48 Minimum Wage Legislation
## Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation
## Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation
minwage_summary$chapter <- as.numeric(str_extract(minwage_summary$Text, "[0-9]+"))
minwage_summary
## Corpus consisting of 14 documents, showing 14 documents:
##
## Text Types Tokens Sentences book chapter
## Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA
## Colorado.pdf 118 185 7 Minimum Wage Legislation NA
## Florida.1.txt 399 1503 49 Minimum Wage Legislation 1
## Florida.2.txt 383 978 18 Minimum Wage Legislation 2
## FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA
## Georgia.txt 185 394 6 Minimum Wage Legislation NA
## Massachusetts.pdf 267 2332 22 Minimum Wage Legislation NA
## Minnesota.pdf 528 2172 46 Minimum Wage Legislation NA
## notes.txt 116 770 1 Minimum Wage Legislation NA
## Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation NA
## Texas.txt 799 4088 245 Minimum Wage Legislation NA
## Utah.pdf 551 2343 48 Minimum Wage Legislation NA
## Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation NA
## Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation NA
docvars(minwage_corpus) <- minwage_summary
docvars(minwage_corpus)
## Text Types Tokens Sentences book chapter
## 1 Arkansas.pdf 2049 14290 675 Minimum Wage Legislation NA
## 2 Colorado.pdf 118 185 7 Minimum Wage Legislation NA
## 3 Florida.1.txt 399 1503 49 Minimum Wage Legislation 1
## 4 Florida.2.txt 383 978 18 Minimum Wage Legislation 2
## 5 FLSA.pdf 3386 77668 2357 Minimum Wage Legislation NA
## 6 Georgia.txt 185 394 6 Minimum Wage Legislation NA
## 7 Massachusetts.pdf 267 2332 22 Minimum Wage Legislation NA
## 8 Minnesota.pdf 528 2172 46 Minimum Wage Legislation NA
## 9 notes.txt 116 770 1 Minimum Wage Legislation NA
## 10 Pennsylvania.pdf 1513 10363 493 Minimum Wage Legislation NA
## 11 Texas.txt 799 4088 245 Minimum Wage Legislation NA
## 12 Utah.pdf 551 2343 48 Minimum Wage Legislation NA
## 13 Wisconsin.pdf 2487 21060 793 Minimum Wage Legislation NA
## 14 Wyoming.pdf 5445 97310 2094 Minimum Wage Legislation NA
#State
small_corpus <- corpus_subset(minwage_corpus, Tokens < 5000)
summary(small_corpus)
## Corpus consisting of 9 documents, showing 9 documents:
##
## Text Types Tokens Sentences Text Types Tokens
## Colorado.pdf 118 185 7 Colorado.pdf 118 185
## Florida.1.txt 399 1503 49 Florida.1.txt 399 1503
## Florida.2.txt 383 978 18 Florida.2.txt 383 978
## Georgia.txt 185 394 6 Georgia.txt 185 394
## Massachusetts.pdf 267 2332 22 Massachusetts.pdf 267 2332
## Minnesota.pdf 528 2172 46 Minnesota.pdf 528 2172
## notes.txt 116 770 1 notes.txt 116 770
## Texas.txt 799 4088 245 Texas.txt 799 4088
## Utah.pdf 551 2343 48 Utah.pdf 551 2343
## Sentences book chapter
## 7 Minimum Wage Legislation NA
## 49 Minimum Wage Legislation 1
## 18 Minimum Wage Legislation 2
## 6 Minimum Wage Legislation NA
## 22 Minimum Wage Legislation NA
## 46 Minimum Wage Legislation NA
## 1 Minimum Wage Legislation NA
## 245 Minimum Wage Legislation NA
## 48 Minimum Wage Legislation NA
#State
# the default breaks on white space
minwage_tokens <- tokens(minwage_corpus)
print(minwage_tokens)
## Tokens consisting of 14 documents and 6 docvars.
## Arkansas.pdf :
## [1] "AGENCY" "#010.14" "ADMINISTRATIVE" "RULES"
## [5] "REGARDING" "THE" "ARKANSAS" "MINIMUM"
## [9] "WAGE" "ACT" "LABOR" "STANDARDS"
## [ ... and 14,278 more ]
##
## Colorado.pdf :
## [1] "Division" "of" "Labor" "Standards" "and"
## [6] "Statistics" "633" "17th" "Street" ","
## [11] "Suite" "600"
## [ ... and 173 more ]
##
## Florida.1.txt :
## [1] "448.110â" "\200" "ƒState" "minimum"
## [5] "wage" ";" "annual" "wage"
## [9] "adjustment" ";" "enforcement.â" "\200"
## [ ... and 1,491 more ]
##
## Florida.2.txt :
## [1] "SECTION" "24" "." "â" "\200" "ƒFlorida"
## [7] "minimum" "wage.â" "\200" "\"" "(" "a"
## [ ... and 966 more ]
##
## FLSA.pdf :
## [1] "29" "USC" "Ch" "." "8" ":"
## [7] "FAIR" "LABOR" "STANDARDS" "From" "Title" "29"
## [ ... and 77,656 more ]
##
## Georgia.txt :
## [1] "2018" "Georgia" "Code" "Title" "34"
## [6] "-" "Labor" "and" "Industrial" "Relations"
## [11] "Chapter" "4"
## [ ... and 382 more ]
##
## [ reached max_ndoc ... 8 more documents ]
# as well as numbers
minwage_tokens <- tokens(minwage_corpus,
remove_punct = T,
remove_numbers = T)
print(minwage_tokens)
## Tokens consisting of 14 documents and 6 docvars.
## Arkansas.pdf :
## [1] "AGENCY" "#010.14" "ADMINISTRATIVE" "RULES"
## [5] "REGARDING" "THE" "ARKANSAS" "MINIMUM"
## [9] "WAGE" "ACT" "LABOR" "STANDARDS"
## [ ... and 11,752 more ]
##
## Colorado.pdf :
## [1] "Division" "of" "Labor" "Standards" "and"
## [6] "Statistics" "17th" "Street" "Suite" "Denver"
## [11] "CO" "80202-2107"
## [ ... and 125 more ]
##
## Florida.1.txt :
## [1] "448.110â" "\200" "ƒState" "minimum"
## [5] "wage" "annual" "wage" "adjustment"
## [9] "enforcement.â" "\200" "â" "\200"
## [ ... and 1,258 more ]
##
## Florida.2.txt :
## [1] "SECTION" "â" "\200" "ƒFlorida" "minimum" "wage.â"
## [7] "\200" "a" "â" "\200" "ƒPUBLIC" "POLICY.â"
## [ ... and 858 more ]
##
## FLSA.pdf :
## [1] "USC" "Ch" "FAIR" "LABOR" "STANDARDS" "From"
## [7] "Title" "LABOR" "CHAPTER" "FAIR" "LABOR" "STANDARDS"
## [ ... and 52,565 more ]
##
## Georgia.txt :
## [1] "Georgia" "Code" "Title" "Labor" "and"
## [6] "Industrial" "Relations" "Chapter" "Minimum" "Wage"
## [11] "Law" "Â"
## [ ... and 308 more ]
##
## [ reached max_ndoc ... 8 more documents ]
#employee/employer may/may not
kwic.1 <- kwic(minwage_tokens,
pattern = phrase("employer may"))
head(kwic.1)
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 895:896] working on fixed schedules an | employer may
## [Arkansas.pdf, 1058:1059] Arkansas In unusual circumstances an | employer may
## [Arkansas.pdf, 1942:1943] A Conditions of employment An | employer may
## [Arkansas.pdf, 2287:2288] Learners Learners and Apprentices An | employer may
## [Arkansas.pdf, 6018:6019] Deductions from minimum wage An | employer may
## [Arkansas.pdf, 6069:6070] the employee in writing An | employer may
##
## | maintain records showing instead of
## | petition the director to maintain
## | pay a full-time student a
## | employ a learner a student
## | not make deductions from the
## | not make deductions from the
kwic.2 <- kwic(minwage_tokens,
pattern = phrase("employee may"))
head(kwic.2)
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 7641:7642] work time For example an |
## [Arkansas.pdf, 8596:8597] more the employer and the |
## [Arkansas.pdf, 10565:10566] Enforcement A Employee Claims An |
## [FLSA.pdf, 16692:16693] maximum period during which an |
## [FLSA.pdf, 16725:16726] <U+FFFD> <U+FFFD> <U+FFFD> No eligible |
## [FLSA.pdf, 17345:17346] i and ii that an |
##
## employee may | voluntarily continue to work at
## employee may | agree to exclude bona fide
## employee may | file a claim with the
## employee may | be paid such wage as
## employee may | be paid the wage authorized
## employee may | be paid the wage authorized
kwic.3 <- kwic(minwage_tokens,
pattern = phrase("employer may not"))
head(kwic.3)
## Keyword-in-context with 6 matches.
## [Arkansas.pdf, 6018:6020] Deductions from minimum wage An | employer may not |
## [Arkansas.pdf, 6069:6071] the employee in writing An | employer may not |
## [FLSA.pdf, 3157:3159] regularly receive tips B An | employer may not |
## [FLSA.pdf, 4630:4632] of tips determined by the | employer may not |
## [FLSA.pdf, 5222:5224] of tips determined by the | employer may not |
## [Texas.txt, 1121:1123] EMPLOYEES SUBJECT TO CALL An | employer may not |
##
## make deductions from the minimum
## make deductions from the applicable
## keep tips received by its
## exceed the value of tips
## exceed the value of tips
## be required to pay an
kwic.4 <- kwic(minwage_tokens,
pattern = phrase("employee may not"))
head(kwic.4)
## Keyword-in-context with 2 matches.
## [Texas.txt, 2602:2604] similarly affected employees b An |
## [Wisconsin.pdf, 13362:13364] uninterrupted or employees relieving that |
##
## employee may not | be a plaintiff to an
## employee may not | be on duty for more
Objective: Define the parts of institutional statements to locate and hand code documents
Attribute: An actor (individual or corporate) that carries out, or is expected to (or not to) carry out the action(Aim) of the statement. (May contain descriptors of the actor)
Aim: The goal or action of the statement assigned to the statement Attribute.
Context: The context instantiates settings in which the focal action of a statement applies, or qualifies the action indicated in an institutional statement.
Object: The inanimate or animate part of an institutional statement that is the receiver of the action captured in the Aim.
Deontic: A prescriptive operator that defines to what extent the action of an institutional statement is compelled, restrained, or discretionary.
Or else: An incentivising provision associated with te action indicated in a particular institutional statement that can exist wholly within an institutional statement, or be represented in a nested institutional statement
Example:
“…a written notification of proposed suspension orrevocation of certification …”
Attribute: Aim: “may”, “may not”, “shall”, “shall not” Context: Object: “employer”, “employee” Deontic: “wage”, “minimum”, “poor”, “work”, “labor”, “families”, “percent”, “level”, “one”, “union”, “non”, “state”, “poverty”, “income” Or else:
I worked with Nvivo to atuo-code the documents, that way I could pick out which words I want to focus on:
Word Frequency in State Corpus
“Wage” Query in State Corpus
“May” Query in State Corpus
“May Not” Query in State Corpus
I also created a binder with all the documents and cheat sheets as a guide
Objective: Create document feature matrix to identify where key terms show up throughout the states
# create the dfm
minwage_dfm <- dfm(minwage_corpus,
tolower = TRUE,
remove_punct = TRUE,
stem = FALSE,
remove = stopwords("english")
)
## Warning: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
## Warning: 'stem' is deprecated; use dfm_wordstem() instead
# find out a quick summary of the dfm
minwage_dfm
## Document-feature matrix of: 14 documents, 5,533 features (84.93% sparse) and 6 docvars.
## features
## docs agenc #010.14 administr rule regard arkansa minimum wage act
## Arkansas.pdf 12 1 14 67 4 14 38 81 67
## Colorado.pdf 0 0 0 0 0 0 5 5 0
## Florida.1.txt 1 0 0 2 0 0 21 27 7
## Florida.2.txt 2 0 1 2 0 0 13 16 1
## FLSA.pdf 104 0 86 12 19 0 118 252 272
## Georgia.txt 0 0 0 0 0 0 6 6 2
## features
## docs labor
## Arkansas.pdf 43
## Colorado.pdf 1
## Florida.1.txt 5
## Florida.2.txt 2
## FLSA.pdf 231
## Georgia.txt 1
## [ reached max_ndoc ... 8 more documents, reached max_nfeat ... 5,523 more features ]
topfeatures(minwage_dfm, 20)
## <U+FFFD> employ employe shall section 1 act wage may b
## 6320 2779 1929 1905 1165 1126 1021 963 921 871
## work provid rate state 3 2 hour l servic pub
## 774 771 756 725 718 694 687 655 626 608
set.seed(1234)
# draw the wordcloud
textplot_wordcloud(minwage_dfm, min_count = 50, random_order = FALSE)
minwageDict <- dictionary(list(articles = c("the", "a", "and"),
conjunctions = c("and", "but", "or", "nor", "for", "yet", "so"),
deontics = c("wage", "minimum", "poor", "work", "labor", "families", "percent", "level", "one", "union", "non", "state", "poverty", "income"),
employee = c("employee"),
employer = c("employer"),
posactions = c("may", "shall"),
negactions = c("may not", "shall not")))
minwageDfm <- dfm(minwage_corpus, dictionary = minwageDict)
## Warning: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
## Warning: 'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead
minwageDfm[1:10,]
## Document-feature matrix of: 10 documents, 7 features (27.14% sparse) and 6 docvars.
## features
## docs articles conjunctions deontics employee employer posactions
## Arkansas.pdf 1403 825 236 152 109 108
## Colorado.pdf 9 6 15 0 0 2
## Florida.1.txt 151 64 71 0 8 34
## Florida.2.txt 82 65 40 0 3 24
## FLSA.pdf 4820 2883 795 359 191 593
## Georgia.txt 23 21 14 3 7 6
## features
## docs negactions
## Arkansas.pdf 12
## Colorado.pdf 0
## Florida.1.txt 2
## Florida.2.txt 2
## FLSA.pdf 73
## Georgia.txt 3
## [ reached max_ndoc ... 4 more documents ]
For the next steps, I can look into how the positive and negative action terms interact with the employer vs employee terms, as well as the various other key words I’ve identified throughout the documents.