Load packages

library(readr)
library(quanteda)
## Package version: 4.2.0
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(distrom)
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
## 
## Attaching package: 'distrom'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(textir)
library(quanteda.textplots)

Load file

# Read the executive-order table from a CSV file located relative to the
# current working directory; readr guesses the column types.
Exec_Orders <- read_csv(file = "EO_All_WithText.csv")
## New names:
## Rows: 6648 Columns: 21
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (14): citation, document_number, html_url, pdf_url, type, subtype, titl... dbl
## (4): ...1, end_page, start_page, year lgl (1): not_received_for_publication
## date (2): publication_date, signing_date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Loop that takes titles out of text

# Remove the first occurrence of each order's title from its own text.
# - The title is matched literally (fixed = TRUE): titles routinely contain
#   regex metacharacters such as "(", ")", "." or "?", which would otherwise
#   be interpreted as a pattern and either fail to match or raise an error.
# - Rows with a missing title or text are left untouched; sub() with an NA
#   pattern would turn the whole text into NA.
# - seq_len() keeps this safe for a zero-row table.
Exec_Orders$text <- vapply(
  seq_len(nrow(Exec_Orders)),
  function(i) {
    title_i <- Exec_Orders$title[i]
    text_i <- Exec_Orders$text[i]
    if (is.na(title_i) || is.na(text_i)) {
      return(text_i)
    }
    sub(title_i, "", text_i, fixed = TRUE)
  },
  character(1)
)

All president signatures for later use

# Presidential signature markers to strip from the order texts. These are
# literal strings, not regular expressions.
pres <- c("GWBOLD", "BIDEN", "(Presidential Sig.)", "OB#1", "Trump", "JOSEPH R. BIDEN JR.")
# Escape regex metacharacters so "(", ")" and "." only match themselves;
# unescaped, "(Presidential Sig.)" would also match e.g. "presidential sign".
pres <- gsub("([.\\\\|()\\[\\]{}^$*+?])", "\\\\\\1", pres, perl = TRUE)
# Collapse into a single alternation pattern for use with regex().
pres <- paste(pres, collapse = "|")

EO specific stopwords for later use

# Executive-order-specific stopwords, grouped by theme. unique() drops the
# entries that appear in more than one group (e.g. "states",
# "implementation").
eo_stopwords <- unique(c(
  # Generic EO scaffolding
  "section","sections","subsection","subsections","paragraph","paragraphs",
  "clause","clauses","order","orders","executive","presidential","proclamation","ii","iii",
  # Formal opening boilerplate
  "authority","vested","constitution","laws","united","states","america",
  "federal","government","department","agencies","agency","office","official",
  "secretary","director","administrator","council","board",
  # Common verbs in EO legal phrasing
  "shall","hereby","thereof","therein","herein","within","thereby","pursuant",
  # Frequent EO nouns that are usually boilerplate
  "policy","policies","program","programs","initiative","initiatives",
  "report","reports","guidance","requirements","implementation",
  "plan","plans","regulation","regulations","rule","rules",
  # Common structural/administrative filler
  "established","establish","establishment","create","created","creation",
  "amend","amended","amendment","termination","terminate","revoked","revoke",
  "effective","date","period","days","duration","applicable","application",
  # Geographic boilerplate
  "national","nation","state","states","district","territory","territories",
  "region","regions","regional",
  # Miscellaneous
  "executed","execution","signed","sign","issuance","issued",
  "implement","implementation"))
# Collapse into one alternation anchored on word boundaries. Without the
# \b anchors the pattern deletes stopwords found *inside* other words
# ("coordination" -> "coordi", "border" -> "b", "orders" -> "s").
eo_stopwords <- paste0("\\b(?:", paste(eo_stopwords, collapse = "|"), ")\\b")

Filter to include 2001-present because preamble is consistent

# Keep only the orders whose signing date is later than 2001-01-18.
# filter() also drops rows where the comparison evaluates to NA.
EO_recents <- filter(Exec_Orders, signing_date > "2001-01-18")

Create corpus

# Build a quanteda corpus from the filtered table, taking the document
# text from the "text" column.
EO_recents_corpus <- EO_recents %>%
  corpus(text_field = "text")
## Warning: NA is replaced by empty string

Preprocessing: remove everything up to and including the first “Executive Order”, for texts where the preamble has important info

# Lazily match from the start of each document through the first
# "Executive Order" and delete that span.
clean_EO <- sub(
  pattern = ".*?Executive Order",
  replacement = "",
  x = EO_recents_corpus
)

Remove preamble – everything up to and including the first “of the United States of America”

# Delete everything from the start of each document through the first
# "of the United States of America" (lazy match).
clean_EO <- sub(
  pattern = ".*?of the United States of America",
  replacement = "",
  x = clean_EO
)

Remove all newline characters (“\n”)

# Replace every newline with a single space. Deleting the newline outright
# would fuse the last word of one line with the first word of the next
# ("the\nPresident" -> "thePresident"), producing bogus tokens later on.
# fixed = TRUE: "\n" is a literal character, no regex needed.
clean_EO <- gsub("\n", " ", clean_EO, fixed = TRUE)

Remove president name

# Blank out every case-insensitive match of the signature pattern `pres`.
clean_EO <- str_replace_all(
  clean_EO,
  regex(pres, ignore_case = TRUE),
  ""
)

Remove ending (the “.EPS” marker works for 2008–present; the “THE WHITE HOUSE” marker, applied first, handles 2001–2008)

# Truncate each document at the first closing marker: first at a
# case-sensitive "THE WHITE HOUSE", then at the first ".EPS".
clean_EO <- clean_EO %>%
  sub("THE WHITE HOUSE.*", "", x = .) %>%
  sub("\\.EPS.*", "", x = .)

Remove eo specific stopwords

# Blank out every case-insensitive match of the EO stopword pattern.
clean_EO <- str_replace_all(
  clean_EO,
  regex(eo_stopwords, ignore_case = TRUE),
  ""
)

Tokenizing

# Tokenise the cleaned texts and normalise the tokens.
# Order of steps matters:
#   1. lower-case first, because the Snowball stemmer only recognises
#      lower-case suffixes (upper-case tokens would stay unstemmed);
#   2. drop generic English stopwords;
#   3. stem;
#   4. drop one-character tokens (letter labels such as a, b, c, i) *before*
#      building n-grams, so they do not survive inside bigrams/trigrams;
#   5. build 1- to 3-grams.
EO_tokens <- tokens(clean_EO,
                    remove_punct = TRUE,
                    remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords(source = "snowball")) %>%
  tokens_wordstem() %>%
  tokens_keep(min_nchar = 2) %>%
  tokens_ngrams(n = 1:3)

Make dfm

# Turn the token lists into a document-feature matrix.
EO_dfm <- EO_tokens %>%
  dfm()

Trim words that are super frequent/infrequent

# Keep features that occur in at least 3 documents and in at most 80% of
# the documents (both limits expressed as document counts), then list the
# 20 most frequent remaining features.
EO_dfm <- dfm_trim(
  EO_dfm,
  min_docfreq = 3,
  max_docfreq = .8 * ndoc(EO_recents_corpus),
  docfreq_type = "count"
)
topfeatures(EO_dfm, 20)
##     secur    presid  appropri       act    servic    provid       may   consist 
##      4130      3787      3733      3684      3533      3396      3162      2798 
##    inform   develop     u.s.c      head administr    public     manag    action 
##      2795      2534      2439      2287      2259      2174      2173      2081 
##       use     relat   support     gener 
##      2063      1993      1912      1831
# Draw a word cloud limited to 100 words from the trimmed matrix.
textplot_wordcloud(x = EO_dfm, max_words = 100)