Week 2

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(quanteda)
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 6 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textmodels)
library(spacyr)
library(stopwords)

Load Files

# Read every file in the State folder (a mix of .pdf and .txt) into one
# readtext data frame.
# The encoding is given explicitly: read without it, the .txt files come out
# with mojibake tokens such as "448.110â" / "ƒState" (Florida) and "Â"
# (Georgia) once tokenised.
# NOTE(review): assumes the .txt files are saved as UTF-8 -- confirm.
# NOTE(review): the path is absolute and machine-specific; a project-relative
# path would make the script portable.
my_texts <- readtext::readtext(
  "C:/Users/kebre/Rwd/Minimum Wage Research/State",
  encoding = "UTF-8"
)

Build Corpus

#State
# Build the quanteda corpus from the files read above.
minwage_corpus <- corpus(my_texts)
# summary() on a corpus only reports the first 100 documents by default.
# Ask for all of them so the table always has exactly one row per document;
# it is assigned back to the corpus as docvars further down, which requires
# the row count to match the number of documents.
minwage_summary <- summary(minwage_corpus, n = ndoc(minwage_corpus))
minwage_summary
## Corpus consisting of 13 documents, showing 13 documents:
## 
##               Text Types Tokens Sentences
##       Arkansas.pdf  2049  14290       675
##       Colorado.pdf   118    185         7
##      Florida.1.txt   399   1503        49
##      Florida.2.txt   383    978        18
##           FLSA.pdf  3386  77668      2357
##        Georgia.txt   185    394         6
##  Massachusetts.pdf   267   2332        22
##      Minnesota.pdf   528   2172        46
##   Pennsylvania.pdf  1513  10363       493
##          Texas.txt   799   4088       245
##           Utah.pdf   551   2343        48
##      Wisconsin.pdf  2487  21060       793
##        Wyoming.pdf  5445  97310      2094

Metadata

#State
# Show the document-level variables currently attached to the corpus.
docvars(minwage_corpus)
## data frame with 0 columns and 13 rows
# Add a `book` column holding the same label for every document, then display
# the updated summary table.
minwage_summary$book <- "Minimum Wage Legislation"
minwage_summary
## Corpus consisting of 13 documents, showing 13 documents:
## 
##               Text Types Tokens Sentences                     book
##       Arkansas.pdf  2049  14290       675 Minimum Wage Legislation
##       Colorado.pdf   118    185         7 Minimum Wage Legislation
##      Florida.1.txt   399   1503        49 Minimum Wage Legislation
##      Florida.2.txt   383    978        18 Minimum Wage Legislation
##           FLSA.pdf  3386  77668      2357 Minimum Wage Legislation
##        Georgia.txt   185    394         6 Minimum Wage Legislation
##  Massachusetts.pdf   267   2332        22 Minimum Wage Legislation
##      Minnesota.pdf   528   2172        46 Minimum Wage Legislation
##   Pennsylvania.pdf  1513  10363       493 Minimum Wage Legislation
##          Texas.txt   799   4088       245 Minimum Wage Legislation
##           Utah.pdf   551   2343        48 Minimum Wage Legislation
##      Wisconsin.pdf  2487  21060       793 Minimum Wage Legislation
##        Wyoming.pdf  5445  97310      2094 Minimum Wage Legislation
# `chapter` is the first run of digits found in each file name, converted to a
# number; file names that contain no digits get NA.
minwage_summary$chapter <- minwage_summary$Text %>%
  str_extract("[0-9]+") %>%
  as.numeric()
minwage_summary
## Corpus consisting of 13 documents, showing 13 documents:
## 
##               Text Types Tokens Sentences                     book chapter
##       Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA
##       Colorado.pdf   118    185         7 Minimum Wage Legislation      NA
##      Florida.1.txt   399   1503        49 Minimum Wage Legislation       1
##      Florida.2.txt   383    978        18 Minimum Wage Legislation       2
##           FLSA.pdf  3386  77668      2357 Minimum Wage Legislation      NA
##        Georgia.txt   185    394         6 Minimum Wage Legislation      NA
##  Massachusetts.pdf   267   2332        22 Minimum Wage Legislation      NA
##      Minnesota.pdf   528   2172        46 Minimum Wage Legislation      NA
##   Pennsylvania.pdf  1513  10363       493 Minimum Wage Legislation      NA
##          Texas.txt   799   4088       245 Minimum Wage Legislation      NA
##           Utah.pdf   551   2343        48 Minimum Wage Legislation      NA
##      Wisconsin.pdf  2487  21060       793 Minimum Wage Legislation      NA
##        Wyoming.pdf  5445  97310      2094 Minimum Wage Legislation      NA
# Attach the whole summary table (Text, Types, Tokens, Sentences, book,
# chapter) to the corpus as its document variables, then display them.
docvars(minwage_corpus) <- minwage_summary
docvars(minwage_corpus)
##                 Text Types Tokens Sentences                     book chapter
## 1       Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA
## 2       Colorado.pdf   118    185         7 Minimum Wage Legislation      NA
## 3      Florida.1.txt   399   1503        49 Minimum Wage Legislation       1
## 4      Florida.2.txt   383    978        18 Minimum Wage Legislation       2
## 5           FLSA.pdf  3386  77668      2357 Minimum Wage Legislation      NA
## 6        Georgia.txt   185    394         6 Minimum Wage Legislation      NA
## 7  Massachusetts.pdf   267   2332        22 Minimum Wage Legislation      NA
## 8      Minnesota.pdf   528   2172        46 Minimum Wage Legislation      NA
## 9   Pennsylvania.pdf  1513  10363       493 Minimum Wage Legislation      NA
## 10         Texas.txt   799   4088       245 Minimum Wage Legislation      NA
## 11          Utah.pdf   551   2343        48 Minimum Wage Legislation      NA
## 12     Wisconsin.pdf  2487  21060       793 Minimum Wage Legislation      NA
## 13       Wyoming.pdf  5445  97310      2094 Minimum Wage Legislation      NA

Smol

#State
# Keep only the documents whose `Tokens` docvar is below 5000, then summarise
# the reduced corpus.
small_corpus <- corpus_subset(x = minwage_corpus, subset = Tokens < 5000)
summary(object = small_corpus)
## Corpus consisting of 8 documents, showing 8 documents:
## 
##               Text Types Tokens Sentences              Text Types Tokens
##       Colorado.pdf   118    185         7      Colorado.pdf   118    185
##      Florida.1.txt   399   1503        49     Florida.1.txt   399   1503
##      Florida.2.txt   383    978        18     Florida.2.txt   383    978
##        Georgia.txt   185    394         6       Georgia.txt   185    394
##  Massachusetts.pdf   267   2332        22 Massachusetts.pdf   267   2332
##      Minnesota.pdf   528   2172        46     Minnesota.pdf   528   2172
##          Texas.txt   799   4088       245         Texas.txt   799   4088
##           Utah.pdf   551   2343        48          Utah.pdf   551   2343
##  Sentences                     book chapter
##          7 Minimum Wage Legislation      NA
##         49 Minimum Wage Legislation       1
##         18 Minimum Wage Legislation       2
##          6 Minimum Wage Legislation      NA
##         22 Minimum Wage Legislation      NA
##         46 Minimum Wage Legislation      NA
##        245 Minimum Wage Legislation      NA
##         48 Minimum Wage Legislation      NA

White space

#State
# Tokenise with the default settings (breaks on white space) and display the
# resulting tokens object.
minwage_tokens <- tokens(x = minwage_corpus)
minwage_tokens
## Tokens consisting of 13 documents and 6 docvars.
## Arkansas.pdf :
##  [1] "AGENCY"         "#010.14"        "ADMINISTRATIVE" "RULES"         
##  [5] "REGARDING"      "THE"            "ARKANSAS"       "MINIMUM"       
##  [9] "WAGE"           "ACT"            "LABOR"          "STANDARDS"     
## [ ... and 14,278 more ]
## 
## Colorado.pdf :
##  [1] "Division"   "of"         "Labor"      "Standards"  "and"       
##  [6] "Statistics" "633"        "17th"       "Street"     ","         
## [11] "Suite"      "600"       
## [ ... and 173 more ]
## 
## Florida.1.txt :
##  [1] "448.110â"      "\200"             "ƒState"        "minimum"      
##  [5] "wage"          ";"             "annual"        "wage"         
##  [9] "adjustment"    ";"             "enforcement.â" "\200"            
## [ ... and 1,491 more ]
## 
## Florida.2.txt :
##  [1] "SECTION"  "24"       "."        "â"        "\200"        "ƒFlorida"
##  [7] "minimum"  "wage.â"   "\200"        "\""       "("        "a"       
## [ ... and 966 more ]
## 
## FLSA.pdf :
##  [1] "29"        "USC"       "Ch"        "."         "8"         ":"        
##  [7] "FAIR"      "LABOR"     "STANDARDS" "From"      "Title"     "29"       
## [ ... and 77,656 more ]
## 
## Georgia.txt :
##  [1] "2018"       "Georgia"    "Code"       "Title"      "34"        
##  [6] "-"          "Labor"      "and"        "Industrial" "Relations" 
## [11] "Chapter"    "4"         
## [ ... and 382 more ]
## 
## [ reached max_ndoc ... 7 more documents ]
# Tokenise again, this time dropping punctuation as well as numbers.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
minwage_tokens <- tokens(
  minwage_corpus,
  remove_punct = TRUE,
  remove_numbers = TRUE
)
print(minwage_tokens)
## Tokens consisting of 13 documents and 6 docvars.
## Arkansas.pdf :
##  [1] "AGENCY"         "#010.14"        "ADMINISTRATIVE" "RULES"         
##  [5] "REGARDING"      "THE"            "ARKANSAS"       "MINIMUM"       
##  [9] "WAGE"           "ACT"            "LABOR"          "STANDARDS"     
## [ ... and 11,752 more ]
## 
## Colorado.pdf :
##  [1] "Division"   "of"         "Labor"      "Standards"  "and"       
##  [6] "Statistics" "17th"       "Street"     "Suite"      "Denver"    
## [11] "CO"         "80202-2107"
## [ ... and 125 more ]
## 
## Florida.1.txt :
##  [1] "448.110â"      "\200"             "ƒState"        "minimum"      
##  [5] "wage"          "annual"        "wage"          "adjustment"   
##  [9] "enforcement.â" "\200"             "â"             "\200"            
## [ ... and 1,258 more ]
## 
## Florida.2.txt :
##  [1] "SECTION"  "â"        "\200"        "ƒFlorida" "minimum"  "wage.â"  
##  [7] "\200"        "a"        "â"        "\200"        "ƒPUBLIC"  "POLICY.â"
## [ ... and 858 more ]
## 
## FLSA.pdf :
##  [1] "USC"       "Ch"        "FAIR"      "LABOR"     "STANDARDS" "From"     
##  [7] "Title"     "LABOR"     "CHAPTER"   "FAIR"      "LABOR"     "STANDARDS"
## [ ... and 52,565 more ]
## 
## Georgia.txt :
##  [1] "Georgia"    "Code"       "Title"      "Labor"      "and"       
##  [6] "Industrial" "Relations"  "Chapter"    "Minimum"    "Wage"      
## [11] "Law"        "Â"         
## [ ... and 308 more ]
## 
## [ reached max_ndoc ... 7 more documents ]

Keyword in Context

#employee/employer may/may not
# Keyword-in-context hits for the two-word phrase "employer may"; display the
# first six.
kwic.1 <- kwic(x = minwage_tokens, pattern = phrase("employer may"))
head(kwic.1, n = 6L)
## Keyword-in-context with 6 matches.                                                                              
##    [Arkansas.pdf, 895:896]        working on fixed schedules an | employer may
##  [Arkansas.pdf, 1058:1059] Arkansas In unusual circumstances an | employer may
##  [Arkansas.pdf, 1942:1943]        A Conditions of employment An | employer may
##  [Arkansas.pdf, 2287:2288] Learners Learners and Apprentices An | employer may
##  [Arkansas.pdf, 6018:6019]      Deductions from minimum wage An | employer may
##  [Arkansas.pdf, 6069:6070]           the employee in writing An | employer may
##                                       
##  | maintain records showing instead of
##  | petition the director to maintain  
##  | pay a full-time student a          
##  | employ a learner a student         
##  | not make deductions from the       
##  | not make deductions from the
# Keyword-in-context hits for the two-word phrase "employee may"; display the
# first six.
kwic.2 <- kwic(x = minwage_tokens, pattern = phrase("employee may"))
head(kwic.2, n = 6L)
## Keyword-in-context with 6 matches.                                                                     
##    [Arkansas.pdf, 7641:7642]               work time For example an |
##    [Arkansas.pdf, 8596:8597]              more the employer and the |
##  [Arkansas.pdf, 10565:10566]       Enforcement A Employee Claims An |
##      [FLSA.pdf, 16692:16693]         maximum period during which an |
##      [FLSA.pdf, 16725:16726] <U+FFFD> <U+FFFD> <U+FFFD> No eligible |
##      [FLSA.pdf, 17345:17346]                       i and ii that an |
##                                                
##  employee may | voluntarily continue to work at
##  employee may | agree to exclude bona fide     
##  employee may | file a claim with the          
##  employee may | be paid such wage as           
##  employee may | be paid the wage authorized    
##  employee may | be paid the wage authorized
# Keyword-in-context hits for the three-word phrase "employer may not";
# display the first six.
kwic.3 <- kwic(x = minwage_tokens, pattern = phrase("employer may not"))
head(kwic.3, n = 6L)
## Keyword-in-context with 6 matches.                                                                               
##  [Arkansas.pdf, 6018:6020] Deductions from minimum wage An | employer may not |
##  [Arkansas.pdf, 6069:6071]      the employee in writing An | employer may not |
##      [FLSA.pdf, 3157:3159]     regularly receive tips B An | employer may not |
##      [FLSA.pdf, 4630:4632]       of tips determined by the | employer may not |
##      [FLSA.pdf, 5222:5224]       of tips determined by the | employer may not |
##     [Texas.txt, 1121:1123]    EMPLOYEES SUBJECT TO CALL An | employer may not |
##                                     
##  make deductions from the minimum   
##  make deductions from the applicable
##  keep tips received by its          
##  exceed the value of tips           
##  exceed the value of tips           
##  be required to pay an
# Keyword-in-context hits for the three-word phrase "employee may not";
# display the first six.
kwic.4 <- kwic(x = minwage_tokens, pattern = phrase("employee may not"))
head(kwic.4, n = 6L)
## Keyword-in-context with 2 matches.                                                                         
##        [Texas.txt, 2602:2604]         similarly affected employees b An |
##  [Wisconsin.pdf, 13362:13364] uninterrupted or employees relieving that |
##                                         
##  employee may not | be a plaintiff to an
##  employee may not | be on duty for more

#Creating Corpora
***
#list out the object names needed
myBooks <- c("Minimum Wage Legislation")

#create loop
for(i in 1:length(myBooks)) {
  #create corpora
  corpusCall <- paste(myBooks[i],"_corpus <- corpus(",myBooks[i],")", sep = "")
  eval(parse(text=corpusCall))

  # change document names for each chapter to include the book title. If you don’t do this, the document names will be duplicated and you’ll get an error.
  namesCall <- paste("tmpNames <- docnames(",myBooks[i],"_corpus)", sep = "")
  eval(parse(text=namesCall))
  bindCall <- paste("docnames(",myBooks[i],"_corpus) <- paste(\"",myBooks[i],"\", tmpNames, sep = \"-\")", sep = "")
  eval(parse(text=bindCall))

  # create summary data
  summaryCall <- paste(myBooks[i],"_summary <- summary(",myBooks[i],"_corpus)", sep = "")
  eval(parse(text=summaryCall))

  # add indicator
  bookCall <- paste(myBooks[i],"_summary$book <- \"",myBooks[i],"\"", sep = "")
  eval(parse(text=bookCall))

  # add chapter indicator
  chapterCall <- paste(myBooks[i],"_summary$chapter <- as.numeric(str_extract(",myBooks[i],"_summary$Text, \"[0-9]+\"))", sep = "")
  eval(parse(text=chapterCall))

  # add meta data to each corpus
  metaCall <- paste("docvars(",myBooks[i],"_corpus) <- ",myBooks[i],"_summary", sep = "")
  eval(parse(text=metaCall))

}

once the loop finishes up, check to make sure you’ve created what you want

docvars(minwage_corpus) ***

Reflection

I am not sure how to get the loop above to accommodate my corpus. I have a state corpus, with one document per state, and a federal corpus with one document. I realize this isn’t a lot of text, but I am not sure how to easily create the chapters and book as was done in the tutorial. Other than that, so far so good!!

Week 3

Load Libraries

# Take the corpus's document-level variables as the starting table and peek at
# its first six rows.
myData <- docvars(x = minwage_corpus)
head(myData, n = 6L)
##            Text Types Tokens Sentences                     book chapter
## 1  Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA
## 2  Colorado.pdf   118    185         7 Minimum Wage Legislation      NA
## 3 Florida.1.txt   399   1503        49 Minimum Wage Legislation       1
## 4 Florida.2.txt   383    978        18 Minimum Wage Legislation       2
## 5      FLSA.pdf  3386  77668      2357 Minimum Wage Legislation      NA
## 6   Georgia.txt   185    394         6 Minimum Wage Legislation      NA
# The annotation tool needs the raw document text in a column named `text`.
# Take it straight from the corpus that myData's rows were built from, rather
# than from a global `text` object that is not defined anywhere in this
# document (in a clean session that name resolves to graphics::text, a
# function).
myData$text <- as.character(minwage_corpus)
# NOTE(review): assumes cleanNLP is attached and an annotation backend has been
# initialised earlier in the session -- neither is shown here; confirm.
annotated <- cnlp_annotate(myData)
## Processed document 10 of 13
# Peek at the first rows of the token-level annotation table.
head(annotated$token)
## # A tibble: 6 x 11
##   doc_id   sid tid   token  token_with_ws     lemma upos  xpos  feats tid_source
##    <int> <int> <chr> <chr>  <chr>             <chr> <chr> <chr> <chr> <chr>     
## 1      1     1 1     AGENCY "               ~ Agen~ ADV   RB    <NA>  2         
## 2      1     1 2     #      "#"               #     SYM   NN    Numb~ 0         
## 3      1     1 3     010.14 "010.14\n\n\n\n\~ 010.~ NUM   CD    NumT~ 2         
## 4      1     2 1     ADMIN~ "ADMINISTRATIVE " admi~ ADJ   JJ    Degr~ 2         
## 5      1     2 2     RULES  "RULES "          rule  NOUN  NNS   Numb~ 0         
## 6      1     2 3     REGAR~ "REGARDING "      rega~ VERB  VBG   Verb~ 8         
## # ... with 1 more variable: relation <chr>
# Peek at the first rows of the document-level table returned by the annotator.
head(annotated$document)
##            Text Types Tokens Sentences                     book chapter doc_id
## 1  Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 2  Colorado.pdf   118    185         7 Minimum Wage Legislation      NA      2
## 3 Florida.1.txt   399   1503        49 Minimum Wage Legislation       1      3
## 4 Florida.2.txt   383    978        18 Minimum Wage Legislation       2      4
## 5      FLSA.pdf  3386  77668      2357 Minimum Wage Legislation      NA      5
## 6   Georgia.txt   185    394         6 Minimum Wage Legislation      NA      6
# Join the token rows onto the document rows by doc_id and show the first six
# rows of the combined table.
annoData <- annotated$document %>%
  left_join(annotated$token, by = "doc_id")
head(annoData, n = 6L)
##           Text Types Tokens Sentences                     book chapter doc_id
## 1 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 2 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 3 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 4 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 5 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
## 6 Arkansas.pdf  2049  14290       675 Minimum Wage Legislation      NA      1
##   sid tid          token
## 1   1   1         AGENCY
## 2   1   2              #
## 3   1   3         010.14
## 4   2   1 ADMINISTRATIVE
## 5   2   2          RULES
## 6   2   3      REGARDING
##                                                   token_with_ws          lemma
## 1                                                       AGENCY          Agency
## 2                                                             #              #
## 3                                              010.14\n\n\n\n\n         010.14
## 4                                               ADMINISTRATIVE  administrative
## 5                                                        RULES            rule
## 6                                                    REGARDING          regard
##   upos xpos        feats tid_source relation
## 1  ADV   RB         <NA>          2   advmod
## 2  SYM   NN  Number=Sing          0     root
## 3  NUM   CD NumType=Card          2   nummod
## 4  ADJ   JJ   Degree=Pos          2     amod
## 5 NOUN  NNS  Number=Plur          0     root
## 6 VERB  VBG VerbForm=Ger          8     case
# Side-by-side view of the first 40 tokens and their lemmas.
local({
  first_rows <- head(annoData, n = 40)
  cbind(first_rows$token, first_rows$lemma)
})
##       [,1]             [,2]            
##  [1,] "AGENCY"         "Agency"        
##  [2,] "#"              "#"             
##  [3,] "010.14"         "010.14"        
##  [4,] "ADMINISTRATIVE" "administrative"
##  [5,] "RULES"          "rule"          
##  [6,] "REGARDING"      "regard"        
##  [7,] "THE"            "the"           
##  [8,] "ARKANSAS"       "Arkansas"      
##  [9,] "MINIMUM"        "minimum"       
## [10,] "WAGE"           "WAGE"          
## [11,] "ACT"            "Act"           
## [12,] "LABOR"          "Labor"         
## [13,] "STANDARDS"      "standards"     
## [14,] "SECTION"        "section"       
## [15,] "DIVISION"       "division"      
## [16,] "OF"             "of"            
## [17,] "LABOR"          "Labor"         
## [18,] "ARKANSAS"       "Arkansas"      
## [19,] "DEPARTMENT"     "Department"    
## [20,] "OF"             "of"            
## [21,] "LABOR"          "Labor"         
## [22,] "AND"            "and"           
## [23,] "LICENSING"      "Licensing"     
## [24,] "Rules"          "rule"          
## [25,] "effective"      "effective"     
## [26,] "as"             "as"            
## [27,] "of"             "of"            
## [28,] "July"           "July"          
## [29,] "2"              "2"             
## [30,] ","              ","             
## [31,] "2020"           "2020"          
## [32,] "Arkansas"       "Arkansas"      
## [33,] "Department"     "Department"    
## [34,] "of"             "of"            
## [35,] "Labor"          "Labor"         
## [36,] "and"            "and"           
## [37,] "Licensing"      "Licensing"     
## [38,] "Division"       "division"      
## [39,] "of"             "of"            
## [40,] "Labor"          "Labor"

Reflection

It looks like natural language processing isn’t too helpful in my case: the language in these documents isn’t exactly natural! It’s institutional!