Load packages
library(readr)
library(quanteda)
## Package version: 4.2.0
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(distrom)
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
##
## Attaching package: 'distrom'
## The following object is masked from 'package:dplyr':
##
## collapse
library(textir)
library(quanteda.textplots)
Load file
# Read the executive-order table (metadata plus full text) from disk.
Exec_Orders <- read_csv(file = "EO_All_WithText.csv")
## New names:
## Rows: 6648 Columns: 21
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (14): citation, document_number, html_url, pdf_url, type, subtype, titl... dbl
## (4): ...1, end_page, start_page, year lgl (1): not_received_for_publication
## date (2): publication_date, signing_date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Loop that takes titles out of text
# Remove the first occurrence of each order's title from its own text.
# The title is matched literally (fixed = TRUE): titles routinely contain
# regex metacharacters such as "(", ")" and ".", which would otherwise be
# interpreted as a pattern and fail to match the literal title.
for (i in seq_len(nrow(Exec_Orders))) {
  title_i <- Exec_Orders$title[i]
  # A missing or empty title has nothing to strip; skipping it also avoids
  # sub() turning the whole text into NA when the pattern is NA.
  if (is.na(title_i) || !nzchar(title_i)) {
    next
  }
  Exec_Orders$text[i] <- sub(title_i, "", Exec_Orders$text[i], fixed = TRUE)
}
All president signatures for later use
# Signature markers to strip from the texts. Each entry is a regular
# expression, so literal parentheses and dots are escaped; unescaped,
# "(Presidential Sig.)" is a capture group that leaves the surrounding
# parentheses behind in the text.
pres <- c(
  "GWBOLD",
  "BIDEN",
  "\\(Presidential Sig\\.\\)",
  "OB#1",
  "Trump",
  "JOSEPH R\\. BIDEN JR\\."
)
# Collapse into a single alternation pattern.
pres <- paste(pres, collapse = "|")
EO specific stopwords for later use
# Executive-order boilerplate vocabulary to strip from the texts.
eo_stopwords <- c(
  # Generic EO scaffolding
  "section", "sections", "subsection", "subsections", "paragraph", "paragraphs",
  "clause", "clauses", "order", "orders", "executive", "presidential",
  "proclamation", "ii", "iii",
  # Formal opening boilerplate
  "authority", "vested", "constitution", "laws", "united", "states", "america",
  "federal", "government", "department", "agencies", "agency", "office",
  "official", "secretary", "director", "administrator", "council", "board",
  # Common verbs in EO legal phrasing
  "shall", "hereby", "thereof", "therein", "herein", "within", "thereby",
  "pursuant",
  # Frequent EO nouns that are usually boilerplate
  "policy", "policies", "program", "programs", "initiative", "initiatives",
  "report", "reports", "guidance", "requirements", "implementation",
  "plan", "plans", "regulation", "regulations", "rule", "rules",
  # Common structural/administrative filler
  "established", "establish", "establishment", "create", "created", "creation",
  "amend", "amended", "amendment", "termination", "terminate", "revoked",
  "revoke", "effective", "date", "period", "days", "duration", "applicable",
  "application",
  # Geographic boilerplate
  "national", "nation", "state", "states", "district", "territory",
  "territories", "region", "regions", "regional",
  # Miscellaneous
  "executed", "execution", "signed", "sign", "issuance", "issued",
  "implement", "implementation"
)
# Build one whole-word alternation. Without the \b anchors the pattern also
# matches inside longer words ("order" in "border", "state" in "statement",
# "date" in "update") and leaves fragments behind; the anchors also make
# "sections" match in full rather than as "section" plus a stray "s".
# unique() drops the entries that are listed under more than one heading.
eo_stopwords <- paste0(
  "\\b(?:",
  paste(unique(eo_stopwords), collapse = "|"),
  ")\\b"
)
Filter to include 2001-present because preamble is consistent
# Keep only the orders whose signing date is later than 2001-01-18.
EO_recents <- filter(Exec_Orders, signing_date > "2001-01-18")
Create corpus
# Build a quanteda corpus, taking the document text from the "text" column.
EO_recents_corpus <- EO_recents %>%
  corpus(text_field = "text")
## Warning: NA is replaced by empty string
Preprocessing. Remove everything up to and including the first “Executive Order”, for texts where the preamble has important info
# Drop the shortest leading span that ends in "Executive Order"
# (the phrase itself is removed too); texts without it are left unchanged.
clean_EO <- sub(
  pattern = ".*?Executive Order",
  replacement = "",
  x = EO_recents_corpus
)
Remove preamble – everything up to and including the first “of the United States of America”
# Drop the shortest leading span that ends in
# "of the United States of America" (the phrase itself is removed too).
clean_EO <- sub(
  pattern = ".*?of the United States of America",
  replacement = "",
  x = clean_EO
)
Remove all newline characters (“\n”)
# Replace every newline with a space. Deleting the newline outright would
# fuse the last word of one line with the first word of the next
# ("the\nlaws" -> "thelaws") and create spurious tokens later on.
clean_EO <- gsub("\n", " ", clean_EO, fixed = TRUE)
Remove president name
# Delete every case-insensitive match of the signature pattern.
clean_EO <- str_replace_all(clean_EO, regex(pres, ignore_case = TRUE), "")
Remove the ending (cutting at “.EPS” works for 2008–present; cutting at the first “THE WHITE HOUSE” handles 2001–2008)
# Truncate each text at the first closing marker, applying the markers in
# order: first "THE WHITE HOUSE" (case-sensitive), then ".EPS".
for (ending in c("THE WHITE HOUSE.*", "\\.EPS.*")) {
  clean_EO <- sub(ending, "", clean_EO)
}
Remove eo specific stopwords
# Delete every case-insensitive match of the EO stopword pattern.
clean_EO <- str_replace_all(
  clean_EO,
  regex(eo_stopwords, ignore_case = TRUE),
  ""
)
Tokenizing
# Tokenize the cleaned texts (dropping punctuation and numbers), remove
# Snowball stopwords, normalise the tokens, and build 1- to 3-grams.
EO_tokens <- tokens(clean_EO,
                    remove_punct = TRUE,
                    remove_numbers = TRUE) %>%
  tokens_remove(stopwords(source = "snowball")) %>%
  # Lowercase before stemming so capitalised and all-caps variants of a word
  # are stemmed to the same form.
  tokens_tolower() %>%
  tokens_wordstem() %>%
  # Remove single-character tokens (letter labels, e.g. a, b, c, i) before
  # building n-grams; filtering afterwards only drops them as unigrams and
  # leaves them embedded in bigrams and trigrams.
  tokens_keep(min_nchar = 2) %>%
  tokens_ngrams(n = 1:3)
Make dfm
# Build the document-feature matrix from the n-gram tokens.
EO_dfm <- EO_tokens %>%
  dfm()
Trim words that are super frequent/infrequent
# Keep features that occur in at least 3 documents and in no more than 80%
# of the documents in the corpus (both thresholds are document counts).
EO_dfm <- dfm_trim(
  EO_dfm,
  min_docfreq = 3,
  max_docfreq = 0.8 * ndoc(EO_recents_corpus),
  docfreq_type = "count"
)
# Show the 20 most frequent remaining features.
topfeatures(EO_dfm, n = 20)
## secur presid appropri act servic provid may consist
## 4130 3787 3733 3684 3533 3396 3162 2798
## inform develop u.s.c head administr public manag action
## 2795 2534 2439 2287 2259 2174 2173 2081
## use relat support gener
## 2063 1993 1912 1831
# Draw a word cloud of up to 100 features from the trimmed dfm.
EO_dfm %>%
  textplot_wordcloud(max_words = 100)