library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)

# Fetch the New Nation Votes archive only when the extracted table is not
# already on disk, so repeated runs do not download it again.
url <- "http://dl.tufts.edu/file_assets/generic/tufts:MS115.003.001.00001/0"
if (!file.exists("all-votes.tsv")) {
  # The URL does not end in ".zip", so download.file() will not pick binary
  # mode on its own on Windows; mode = "wb" keeps the archive from being
  # corrupted by text-mode line-ending translation.
  download.file(url, "nnv-all-votes.zip", mode = "wb")
  # Extract only the table this analysis reads.
  unzip("nnv-all-votes.zip", files = "all-votes.tsv")
}

# Read the tab-separated vote table extracted above.
nnv <- read_tsv("all-votes.tsv")

# Tidy the column names: lower-case them, strip periods, and turn each
# whitespace character into an underscore.
names(nnv) <- str_replace_all(
  str_replace_all(str_to_lower(names(nnv)), "\\.", ""),
  "\\s",
  "_"
)

# Add an integer `year` column taken from the first four-digit run in `date`.
nnv <- mutate(nnv, year = as.integer(str_extract(date, "\\d{4}")))

Part One

A New Nation Votes is a dataset that is still in the process of being digitized and is the result of the efforts of Philip Lampi. The dataset consists of election results from the early Republic, broken down by state, year, office, jurisdiction, party, and election type. Currently, 18040 elections have been digitized, which is roughly 60% of Lampi’s overall collection. The election data was culled from contemporary sources, including newspapers and election ballots, in addition to nineteenth-century county histories. The dataset is 107 MB, and contains 659,625 observations of 25 variables.

# Print a per-column summary of the cleaned table.
nnv %>% summary()
##       id                date               type          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   iteration            office          office_scope      
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      role            role_scope           state          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   territory             city              county         
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    district             town             township        
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      ward              parish          populated_place   
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    hundred            borough              name          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    name_id          affiliation        affiliation_id          vote       
##  Length:659625      Length:659625      Length:659625      Min.   :     0  
##  Class :character   Class :character   Class :character   1st Qu.:    16  
##  Mode  :character   Mode  :character   Mode  :character   Median :    64  
##                                                           Mean   :   305  
##                                                           3rd Qu.:   186  
##                                                           Max.   :128519  
##                                                           NA's   :60996   
##       year     
##  Min.   :1787  
##  1st Qu.:1802  
##  Median :1810  
##  Mean   :1810  
##  3rd Qu.:1819  
##  Max.   :1826  
## 

What kinds of elections were there, and how many of each kind of election?

# Number of rows recorded under each election type.
count(nnv, type)
## Source: local data frame [6 x 2]
## 
##                  type      n
##                 (chr)  (int)
## 1             General 620179
## 2        Legislastive     11
## 3         Legislative  24576
## 4             Special  14770
## 5    Special Election     14
## 6 Special Legislative     75
# Bar chart of how many rows fall under each election type.
ggplot(data = nnv, mapping = aes(x = type)) +
  geom_bar(stat = "count")

Using the count function, it is evident that there are six types of elections represented in the dataset: general, legislative, special, legislastive, special legislative, and special election. I am going to assume that ‘legislastive’ is a spelling error. I am also curious as to the difference between ‘special’ and ‘special election.’ The overwhelming majority of the elections were general elections.

How many candidates are represented and how often do they appear?

# Tally how often each (name, name_id) pair appears, most frequent first.
# ungroup() drops any grouping count() may leave behind, so that arrange()
# orders the table as a whole.
arrange(
  ungroup(count(nnv, name, name_id)),
  desc(n)
)
## Source: local data frame [41,738 x 3]
## 
##                 name name_id     n
##                (chr)   (chr) (int)
## 1       Caleb Strong  SC0023  6282
## 2   William Phillips  PW0081  6136
## 3     Elbridge Gerry  GE0049  5420
## 4             others    null  3993
## 5     James Sullivan  SJ0366  3930
## 6      William Heath  HW0171  3910
## 7  Edward H. Robbins  RE0018  3693
## 8         scattering    null  3650
## 9       Samuel Adams  AS0022  3390
## 10        Moses Gill  GM0022  3298
## ..               ...     ...   ...

Using the count, ungroup, and arrange functions, we can see that over 41 thousand candidates are represented in the dataset. Caleb Strong is the candidate who is most represented in the dataset, as he appears over six thousand times.

Which parties?

# Keep rows whose party label is not the literal string "null". Rows with a
# missing (NA) affiliation are dropped as well, since filter() keeps only rows
# where the condition is TRUE.
nnv_aff <- filter(nnv, affiliation != "null")

# Number of rows recorded under each remaining affiliation.
count(nnv_aff, affiliation)
## Source: local data frame [178 x 2]
## 
##                         affiliation     n
##                               (chr) (int)
## 1  1st Ticket Democratic Republican    80
## 2  2nd Ticket Democratic Republican    80
## 3                           Adamite     4
## 4                    Administration    24
## 5             Administration Ticket     8
## 6              Against New Election    80
## 7                          American    40
## 8               American Republican   102
## 9                   Anit-Republican     6
## 10                     Anti Federal    32
## ..                              ...   ...

Using the affiliation variable, there are 178 affiliations, discounting null and NA.

What does each row in the dataset represent?

Each row is a particular candidate running in a particular election in a particular year, and sometimes in a specific region.

Which years are in the dataset, and how many elections are there in each year?

# Derive the integer year from the first four-digit run in `date`
# (this repeats the derivation performed right after the data was loaded).
nnv <- mutate(nnv, year = as.integer(str_extract(date, "\\d{4}")))

# Count the distinct election ids that occur in each year.
summarise(
  group_by(nnv, year),
  unique_elements = n_distinct(id)
)
## Source: local data frame [40 x 2]
## 
##     year unique_elements
##    (int)           (int)
## 1   1787              69
## 2   1788             159
## 3   1789             186
## 4   1790             168
## 5   1791             156
## 6   1792             194
## 7   1793             167
## 8   1794             243
## 9   1795             202
## 10  1796             357
## ..   ...             ...

Within the dataset, the years 1787 through 1826 are represented. 1824 had the most elections out of those years, topping out at 1221. The final year of the dataset, 1826, had the lowest number of elections at 6.

Which states are represented?

# Keep rows whose state is not the literal string "null"; rows with a missing
# (NA) state are dropped too, since filter() keeps only TRUE rows.
nnv_state <- filter(nnv, state != "null")

# Number of rows recorded for each state.
count(nnv_state, state)
## Source: local data frame [25 x 2]
## 
##          state     n
##          (chr) (int)
## 1      Alabama  2422
## 2  Connecticut  5250
## 3     Delaware 10224
## 4      Georgia  8490
## 5     Illinois  4805
## 6      Indiana  4084
## 7     Kentucky  8951
## 8    Louisiana  1610
## 9        Maine 10840
## 10    Maryland 29839
## ..         ...   ...

There are 25 states represented, discounting NA and null. NY has the fewest rows of all the states represented (only ten), although it is very likely that this is a mistake on the part of the researcher who input the data, as there is also data for New York. The state with the most rows is Massachusetts, with 157732; note that these counts are rows (one per candidate per election), not distinct elections.

Part Two

# U.S. House results recorded for Loudoun County, Virginia.
nnv_loudoun <- filter(
  nnv,
  state == "Virginia" &
    county == "Loudoun" &
    office == "U.S. House of Representatives"
)

# Number of rows for each candidate name in that subset.
count(nnv_loudoun, name)
## Source: local data frame [21 x 2]
## 
##                  name     n
##                 (chr) (int)
## 1  Armistead T. Mason     2
## 2          Arthur Lee     1
## 3             Bushrod     1
## 4   Charles F. Mercer     5
## 5     John Littlejohn     1
## 6           John Love     2
## 7           John Pope     1
## 8     John Washington     1
## 9         Joseph Lane     1
## 10       Joseph Lewis     2
## ..                ...   ...
# Keep only rows that carry a party label (drops the literal "null" and NA).
nnv_loudoun <- nnv_loudoun %>%
  filter(affiliation != "null")

# Total the recorded votes for each party before plotting. Counting rows
# (stat = "count") would show how many candidate records each party has, not
# the number of votes the title and y axis promise. Missing vote values are
# ignored in the sum, as in the per-year line chart of the same data.
nnv_loudoun %>% 
  group_by(affiliation) %>% 
  summarize(vote = sum(vote, na.rm = TRUE)) %>% 
  ggplot(aes(x = affiliation, y = vote)) +
  geom_bar(stat = "identity") +
  labs(title = "number of votes cast by party in Loudoun", 
        x = "party affiliation", 
        y = "number of votes")

# Line chart of the summed votes for each party in each year; missing vote
# values are ignored in the sums.
ggplot(
  data = summarize(
    group_by(nnv_loudoun, affiliation, year),
    vote = sum(vote, na.rm = TRUE)
  ),
  mapping = aes(x = year, y = vote, color = affiliation)
) +
  geom_line() +
  labs(
    title = "total votes per year in Loudoun County",
    x = "year",
    y = "number of votes"
  )

I wanted to examine data from my home county of Loudoun. Between the years of 1797 and 1825, 21 people ran for the US House of Representatives in Virginia. Because I wanted to examine the number of votes cast per party, I filtered out all records where the affiliation was listed as null. The number of votes cast for Federalist and Republican candidates was almost equal, although Federalist candidates ultimately received the most votes. I then created a line chart comparing the total votes by party per year in Loudoun. I wanted to compare the results of Loudoun with those of a county close by, so I chose Fairfax.

# U.S. House results recorded for Fairfax County, Virginia.
nnv_fairfax <- filter(
  nnv,
  state == "Virginia" &
    county == "Fairfax" &
    office == "U.S. House of Representatives"
)

# Number of rows for each candidate name in that subset.
count(nnv_fairfax, name)
## Source: local data frame [22 x 2]
## 
##                  name     n
##                 (chr) (int)
## 1  Armistead T. Mason     2
## 2          Arthur Lee     1
## 3             Bushrod     1
## 4   Charles F. Mercer     5
## 5     John Littlejohn     1
## 6           John Love     2
## 7           John Pope     1
## 8     John Washington     1
## 9         Joseph Lane     1
## 10       Joseph Lewis     2
## ..                ...   ...
# Keep only rows that carry a party label (drops the literal "null" and NA).
nnv_fairfax <- nnv_fairfax %>%
  filter(affiliation != "null")

# Total the recorded votes for each party before plotting. Counting rows
# (stat = "count") would show how many candidate records each party has, not
# the number of votes the title and y axis promise. Missing vote values are
# ignored in the sum, as in the per-year line chart of the same data.
nnv_fairfax %>% 
  group_by(affiliation) %>% 
  summarize(vote = sum(vote, na.rm = TRUE)) %>% 
  ggplot(aes(x = affiliation, y = vote)) +
  geom_bar(stat = "identity") +
  labs(title = "number of votes cast by party in Fairfax", 
        x = "party affiliation", 
        y = "number of votes")

# Line chart of the summed votes for each party in each year; missing vote
# values are ignored in the sums.
ggplot(
  data = summarize(
    group_by(nnv_fairfax, affiliation, year),
    vote = sum(vote, na.rm = TRUE)
  ),
  mapping = aes(x = year, y = vote, color = affiliation)
) +
  geom_line() +
  labs(
    title = "total votes per year in Fairfax County",
    x = "year",
    y = "number of votes"
  )

Fairfax County had data that looked similar to Loudoun’s: twenty-two candidates ran for the House of Representatives, and Federalists had just barely surpassed Republicans in total number of votes. Similar to Loudoun, in 1811 there was a dip in the number of Republican votes; however, there was not a dip in the number of Federalist votes as there had been in Loudoun. In general, it looks as if Federalists fared better overall in Loudoun than they did in Fairfax. Republicans in Fairfax received a large number of votes prior to 1800, and from 1800 to about 1811 votes for their party steadily decreased. They gained momentum again in 1811, had another dip in votes right after 1820, and then in the next years there was an uptick in votes. Votes for Republicans in Loudoun, however, steadily increased after 1811.

# U.S. House results recorded for Washington County, Virginia.
nnv_washington <- filter(
  nnv,
  state == "Virginia" &
    county == "Washington" &
    office == "U.S. House of Representatives"
)

# Number of rows for each candidate name in that subset.
count(nnv_washington, name)
## Source: local data frame [15 x 2]
## 
##                  name     n
##                 (chr) (int)
## 1       Abraham Trigg     6
## 2         Abram Trigg     1
## 3     Alexander Smyth     4
## 4        Andrew Moore     2
## 5     Arthur Campbell     2
## 6     Benjamin Estill     3
## 7      Daniel Sheffey     5
## 8     Edward Campbell     1
## 9     Francis Preston     4
## 10     George Hancock     1
## 11    Joseph Crockett     1
## 12            Mathews     1
## 13  William A. Graham     1
## 14    William Preston     1
## 15 Zachariah Johnston     1
# Keep only rows that carry a party label (drops the literal "null" and NA).
nnv_washington <- nnv_washington %>%
  filter(affiliation != "null")

# Total the recorded votes for each party before plotting. Counting rows
# (stat = "count") would show how many candidate records each party has, not
# the number of votes the title and y axis promise. Missing vote values are
# ignored in the sum.
nnv_washington %>% 
  group_by(affiliation) %>% 
  summarize(vote = sum(vote, na.rm = TRUE)) %>% 
  ggplot(aes(x = affiliation, y = vote)) +
  geom_bar(stat = "identity") +
  labs(title = "number of votes cast by party in Washington", 
        x = "party affiliation", 
        y = "number of votes")