Bootstrapping Federalist Papers

We will need the following packages.

library(rvest)
library(tidytext)
library(tidyverse)

Web scrape the information.

# Download and parse the three Library of Congress guide pages used below.

# Papers 1-10: source of the Jay, Hamilton, and Madison papers.
Link1 <- "https://guides.loc.gov/federalist-papers/text-1-10#s-lg-box-wrapper-25493264" %>%
  read_html()

# Papers 11-20: source of the paper co-authored by Hamilton and Madison.
Link2 <- "https://guides.loc.gov/federalist-papers/text-11-20#s-lg-box-wrapper-25493282" %>%
  read_html()

# Papers 51-60: source of the paper whose author is treated as unknown.
Link3 <- "https://guides.loc.gov/federalist-papers/text-51-60#s-lg-box-wrapper-25493427" %>%
  read_html()

# NOTE: The heading with the paper number and the paper text live in different
# HTML elements. This script pulls only the <p> text and later separates the
# papers by row position, using each paper's subtitle and closing text (often
# the word "PUBLIUS") as landmarks.

# Collect the text of every <p> element on a parsed page.
#
# page: a parsed HTML document as returned by read_html().
# Returns a one-column tibble, body_text, with one row per <p> element in
# document order. The column is named when the tibble is built, so it never
# receives the default "." name and needs no renaming afterwards. A name
# without a period is also easier to reference from the FPClean script.
paragraphs_to_tibble <- function(page) {
  tibble(body_text = html_text(html_nodes(page, "p")))
}

Link1_prep <- paragraphs_to_tibble(Link1)
Link2_prep <- paragraphs_to_tibble(Link2)
Link3_prep <- paragraphs_to_tibble(Link3)

# Extract one paper per author (or author pair) by row position in the scraped
# paragraph tables.
# NOTE(review): the row ranges are hard-coded against the page layout at the
# time of scraping -- re-check them if the source pages change.

# Jay - Concerning Dangers from Foreign Force and Influence
Jay <- slice(Link1_prep, 19:34)

# Hamilton - Concerning Dangers from Dissensions Between the States
Hamilton <- slice(Link1_prep, 99:119)

# Madison - The Same Subject Continued: The Union as a Safeguard Against
#           Domestic Faction and Insurrection
Madison <- slice(Link1_prep, 178:201)

# Hamilton and Madison - The Same Subject Continued: The Insufficiency of the
#                        Present Confederation to Preserve the Union
HamiltonMadison <- slice(Link2_prep, 121:142)

# Unknown - The Structure of the Government Must Furnish the Proper Checks and
#           Balances Between the Different Departments
Unknown <- slice(Link3_prep, 4:6)

# Gather the papers into a named list so they can be exported one CSV file per
# paper, with each list name doubling as the file name.
paper_list <- list(
  Hamilton = Hamilton,
  Madison = Madison,
  Jay = Jay,
  HamiltonMadison = HamiltonMadison,
  Unknown = Unknown
)

# Export step (disabled). Several CSV files are used rather than one xlsx
# workbook with multiple sheets because CSV is the more practical format in R.
# Base-R write.csv() is used inside the loop because write_csv() proved
# temperamental there.

# for (x in seq_along(paper_list)) {
#   write.csv(
#     paper_list[[x]],
#     file = paste0(names(paper_list)[[x]], ".csv")
#   )
# }

Prep the data and find average word length.

# Average word length per author.
# Looping over the list of papers is awkward because unnest_tokens() needs a
# column of a data frame as its input. Instead, tag every paper with an author
# column, stack the tables, and do the per-author work on the combined table.
Hamilton <- mutate(Hamilton, author = "Hamilton")
Madison <- mutate(Madison, author = "Madison")
Jay <- mutate(Jay, author = "Jay")
HamiltonMadison <- mutate(HamiltonMadison, author = "Hamilton and Madison")
Unknown <- mutate(Unknown, author = "Unknown")

papers <- bind_rows(Hamilton, Madison, Jay, HamiltonMadison, Unknown)

# One row per (author, word): n is how often the author used the word,
# word_length is its character count, and total_length is the number of
# characters contributed by all n occurrences.
word_counts <- papers %>%
  unnest_tokens(output = words, input = body_text, token = "words") %>%
  group_by(author) %>%
  count(words, sort = TRUE) %>%
  mutate(
    word_length = nchar(words),
    total_length = n * word_length
  )

# Characters written divided by words written, per author.
author_avg_word_length <- word_counts %>%
  group_by(author) %>%
  summarise(`average word length` = sum(total_length) / sum(n))

# Split Federalist Paper No. 51 (author unknown) into one row per word. The
# CSV export below is kept for reference but is disabled.
unknown_tokenized <- unnest_tokens(
  Unknown,
  output = words,
  input = body_text,
  token = "words"
)

# write.csv(unknown_tokenized, file = "Unknown_Tokenized.csv")

Bootstrap samples from Federalist Paper No. 51 and place the results in a histogram to estimate authorship.

### Bootstrap samples from Federalist Paper No. 51.
# Repeatedly draw 5 words from the tokenized paper and record the average word
# length of each draw. The resulting averages are plotted on a histogram with
# vertical lines marking the average word length of the known authors and the
# author combination, to suggest a potential author for Paper No. 51.

# Take 1,000 samples. Each draw is made WITH replacement, which is what makes
# it a bootstrap draw; slice_sample() samples without replacement unless
# replace = TRUE is requested. mean() divides by the number of words actually
# drawn instead of assuming a fixed 5. simplify = FALSE keeps averages_list a
# plain list with one numeric average per replicate.
# NOTE(review): no seed is set, so the averages differ from run to run; call
# set.seed() before this chunk if reproducible figures are needed.
averages_list <- replicate(
  1000,
  mean(nchar(sample(unknown_tokenized$words, size = 5, replace = TRUE))),
  simplify = FALSE
)

# One row per replicate. The column is named `value` because the plotting code
# maps aes(x = value).
sample_averages <- tibble(value = unlist(averages_list, use.names = FALSE))

# Histogram of the bootstrapped average word lengths.
# Solid vertical lines mark the known authors (colors are explained in the
# caption); the dashed blue line marks the mean of the bootstrapped samples.
# Stylistic note: a legend matching line color to author would need
# scale_color_manual(), which requires a column to reference. Since the lines
# are drawn from fixed x-intercepts, the caption is used instead.
# NOTE(review): the four solid-line intercepts are hard-coded; presumably they
# were copied from author_avg_word_length -- confirm they still match after
# re-scraping.
ggplot(sample_averages, aes(x = value)) +
  geom_histogram(binwidth = 0.1) +
  geom_vline(xintercept = 4.923508, color = "black") +
  geom_vline(xintercept = 5.013823, color = "red") +
  geom_vline(xintercept = 4.903823, color = "darkgreen") +
  geom_vline(xintercept = 4.834774, color = "darkorange1") +
  # The mean is computed once and passed as a constant so a single line is
  # drawn; inside aes() it would be recycled to one line per data row.
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  geom_vline(
    xintercept = mean(sample_averages$value),
    color = "blue",
    linetype = "dashed",
    linewidth = 1.2
  ) +
  scale_x_continuous(breaks = seq(2, 10, 1)) +
  labs(
    title = "Bootstrapped Sample of Average Word Length from Federalist Paper No. 51",
    x = "Average Word Length",
    y = "Frequency",
    caption = " Black - Hamilton, Red - Hamilton and Madison, Green - Jay, Orange - Madison, Blue - Unknown"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Zoom in on the region around the author averages for a better view.
# Solid lines mark the known authors (see caption); the dashed blue line marks
# the mean of the bootstrapped samples.
ggplot(sample_averages, aes(x = value)) +
  geom_histogram(binwidth = 0.1) +
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  geom_vline(xintercept = 4.923508, color = "black", linewidth = 1.2) +
  geom_vline(xintercept = 5.013823, color = "red", linewidth = 1.2) +
  geom_vline(xintercept = 4.903823, color = "darkgreen", linewidth = 1.2) +
  geom_vline(xintercept = 4.834774, color = "darkorange1", linewidth = 1.2) +
  # The mean is passed as a constant so exactly one line is drawn.
  geom_vline(
    xintercept = mean(sample_averages$value),
    color = "blue",
    linetype = "dashed",
    linewidth = 1.2
  ) +
  # coord_cartesian() zooms the view without discarding data. Setting limits
  # on the x scale instead removes every observation outside the range before
  # the histogram is binned.
  coord_cartesian(xlim = c(4.7, 5.2)) +
  labs(
    title = "Bootstrapped Sample of Average Word Length from Federalist Paper No. 51",
    subtitle = "(Zoomed in for an Enhanced View)",
    x = "Average Word Length",
    y = "Frequency",
    caption = " Black - Hamilton, Red - Hamilton and Madison, Green - Jay, Orange - Madison, Blue - Unknown"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

## Warning: Removed 832 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Bootstrapping Federalist Papers

Kevin Straka

03-2025

We will need the following packages.

Web scrape the information.

Prep the data and find average word length.

Bootstrap samples from Federalist Paper No. 51 and place the results in a histogram to estimate authorship.

Observe that the bootstrapped average word length and the actual average word
length for Federalist Paper No. 51 are almost identical. The blue dashed line
(unknown author) and the black line (Hamilton) are practically the same. Based
on the average word length of the bootstrapped samples, Alexander Hamilton is
most likely the author of Federalist Paper No. 51. (Keep in mind that average
word length alone is a very rough estimate of authorship.)