library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)

# Fetch and unpack the vote data only when the extracted file is not
# already present in the working directory.
url <- "http://dl.tufts.edu/file_assets/generic/tufts:MS115.003.001.00001/0"
if (!file.exists("all-votes.tsv")) {
  # mode = "wb": the archive is a binary file, and a text-mode download can
  # corrupt it on Windows.
  download.file(url, "nnv-all-votes.zip", mode = "wb")
  unzip("nnv-all-votes.zip", files = "all-votes.tsv")
}

# Load the tab-separated vote records.
nnv <- read_tsv("all-votes.tsv")

# Normalise the column names: lower-case them, then turn every space into
# an underscore.
names(nnv) <- str_replace_all(str_to_lower(names(nnv)), "\\ ", "_")

Part One

A New Nation Votes is a dataset that is still in the process of being digitized and is the result of the efforts of Philip Lampi. The dataset consists of election results from the early Republic, broken down by state, year, office, jurisdiction, party, and election type. Currently, 18,040 elections have been digitized, which is roughly 60% of Lampi’s overall collection. The election data was culled from contemporary sources, including newspapers and election ballots, in addition to nineteenth-century county histories. The dataset is 107 MB and contains 659,625 observations of 25 variables.

# Per-column overview of the loaded table.
nnv %>% summary()
##       id                date               type          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   iteration            office          office_scope      
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      role            role_scope           state          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   territory             city              county         
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    district             town             township        
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      ward              parish          populated_place   
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    hundred            borough              name          
##  Length:659625      Length:659625      Length:659625     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    name_id          affiliation        affiliation_id          vote       
##  Length:659625      Length:659625      Length:659625      Min.   :     0  
##  Class :character   Class :character   Class :character   1st Qu.:    16  
##  Mode  :character   Mode  :character   Mode  :character   Median :    64  
##                                                           Mean   :   305  
##                                                           3rd Qu.:   186  
##                                                           Max.   :128519  
##                                                           NA's   :60996

What kinds of elections were there, and how many of each kind of election?

# Tally the rows recorded under each election type.
count(nnv, type)
## Source: local data frame [6 x 2]
## 
##                  type      n
##                 (chr)  (int)
## 1             General 620179
## 2        Legislastive     11
## 3         Legislative  24576
## 4             Special  14770
## 5    Special Election     14
## 6 Special Legislative     75
# Bar chart of the number of rows per election type.
ggplot(data = nnv, mapping = aes(x = type)) +
  geom_bar(stat = "count")

Using the count function, it is evident that there are six types of elections represented in the dataset: general, legislative, special, legislastive, special legislative, and special election. I am going to assume that ‘legislastive’ is a spelling error. I am also curious as to the difference between ‘special’ and ‘special election.’ The overwhelming majority of the elections were general elections.

How many candidates are represented and how often do they appear?

# Count rows per (name, name_id) pair, drop any grouping left by count(),
# and list the most frequent pairs first.
arrange(
  ungroup(count(nnv, name, name_id)),
  desc(n)
)
## Source: local data frame [41,738 x 3]
## 
##                 name name_id     n
##                (chr)   (chr) (int)
## 1       Caleb Strong  SC0023  6282
## 2   William Phillips  PW0081  6136
## 3     Elbridge Gerry  GE0049  5420
## 4             others    null  3993
## 5     James Sullivan  SJ0366  3930
## 6      William Heath  HW0171  3910
## 7  Edward H. Robbins  RE0018  3693
## 8         scattering    null  3650
## 9       Samuel Adams  AS0022  3390
## 10        Moses Gill  GM0022  3298
## ..               ...     ...   ...

Using the count, ungroup, and arrange functions, there are over 41 thousand candidates represented in the dataset. Caleb Strong is the candidate who is most represented in the dataset, as he appears over six thousand times.

Which parties?

# Drop rows whose affiliation is the literal string "null"; filter() also
# discards rows where the comparison is NA.
nnv_aff <- filter(nnv, affiliation != "null")

# Number of rows recorded under each remaining affiliation.
count(nnv_aff, affiliation)
## Source: local data frame [178 x 2]
## 
##                         affiliation     n
##                               (chr) (int)
## 1  1st Ticket Democratic Republican    80
## 2  2nd Ticket Democratic Republican    80
## 3                           Adamite     4
## 4                    Administration    24
## 5             Administration Ticket     8
## 6              Against New Election    80
## 7                          American    40
## 8               American Republican   102
## 9                   Anit-Republican     6
## 10                     Anti Federal    32
## ..                              ...   ...

Using the affiliation variable, there are 178 affiliations, discounting null and NA.

What does each row in the dataset represent?

Each row is a particular candidate running in a particular election in a particular year, and sometimes in a specific region.

Which years are in the dataset, and how many elections are there in each year?

# Derive an integer `year` column from the first run of four digits found
# in `date`.
nnv <- mutate(nnv, year = as.integer(str_extract(date, "\\d{4}")))

# Number of distinct election ids in each year.
summarise(
  group_by(nnv, year),
  unique_elements = n_distinct(id)
)
## Source: local data frame [40 x 2]
## 
##     year unique_elements
##    (int)           (int)
## 1   1787              69
## 2   1788             159
## 3   1789             186
## 4   1790             168
## 5   1791             156
## 6   1792             194
## 7   1793             167
## 8   1794             243
## 9   1795             202
## 10  1796             357
## ..   ...             ...

Within the dataset, the years 1787 through 1826 are represented. 1824 had the most elections out of those years, topping out at 1221. The final year of the dataset, 1826, had the lowest number of elections at 6.

Which states are represented?

# Drop rows whose state is the literal string "null"; filter() also
# discards rows where the comparison is NA.
nnv_state <- filter(nnv, state != "null")

# Number of rows recorded for each state.
count(nnv_state, state)
## Source: local data frame [25 x 2]
## 
##          state     n
##          (chr) (int)
## 1      Alabama  2422
## 2  Connecticut  5250
## 3     Delaware 10224
## 4      Georgia  8490
## 5     Illinois  4805
## 6      Indiana  4084
## 7     Kentucky  8951
## 8    Louisiana  1610
## 9        Maine 10840
## 10    Maryland 29839
## ..         ...   ...

There are 25 states represented, discounting NA and null. Note that count() tallies rows (candidate-level observations) rather than distinct elections. NY had the fewest observations–only ten–out of all the states represented, although there is a strong possibility that this is a data-entry inconsistency, as there is also data for New York. The state with the most observations is Massachusetts, with 157,732 rows.

Part Two

# U.S. House rows for Loudoun County, Virginia, restricted to rows whose
# affiliation is not the literal string "null". No group_by() is applied:
# nothing downstream summarises by group, and a lingering grouping would
# silently change how later dplyr verbs behave on this table.
nnv_loudoun <- nnv %>%
  filter(
    state == "Virginia",
    county == "Loudoun",
    office == "U.S. House of Representatives",
    affiliation != "null"
  ) %>%
  select(
    -territory, -township, -ward, -parish, -borough,
    -name_id, -affiliation_id
  )

# Distinct candidate names in the subset.
nnv_loudoun$name %>%
  unique()
##  [1] "Charles F. Mercer"  "Sydnor Bailey"      "Armistead T. Mason"
##  [4] "Joseph Lewis, Jr."  "John Love"          "John Littlejohn"   
##  [7] "Richard Brent"      "Leven Powell"       "John Washington"   
## [10] "Bushrod"            "Joseph Lewis"       "William Tyler"     
## [13] "William Ellzey"     "Roger West"
# Total votes per party. `vote` is mapped to y and the rows are stacked, so
# each bar's height is that party's vote total. Counting rows instead would
# show the number of observations per party, which matches neither the
# title nor the y-axis label. Rows without a recorded vote are dropped
# explicitly rather than via a ggplot2 warning.
nnv_loudoun %>%
  filter(!is.na(vote)) %>%
  ggplot(aes(x = affiliation, y = vote)) +
  geom_bar(stat = "identity") +
  labs(title = "Number of Votes Cast by Party in Loudoun County, Virginia",
       x = "party affiliation",
       y = "number of votes")

I wanted to examine data from my home county of Loudoun. This is a very small dataset, with only 29 observations. Between the years of 1797 and 1825, 14 people ran for the US House of Representatives in Virginia. The number of votes cast for the Federalist and Republican candidates was almost equal, although Federalist candidates ultimately received the most votes.

# Total votes per year. The subset holds one row per candidate, so several
# rows share a year; the votes are summed per year first so the line traces
# overall turnout instead of zig-zagging between individual candidates.
nnv_loudoun %>%
  filter(!is.na(vote)) %>%
  group_by(year) %>%
  summarise(vote = sum(vote)) %>%
  ggplot(aes(x = year, y = vote)) +
  geom_line() +
  labs(title = "Votes from Loudoun County for the U.S. House of Representatives",
       x = "year",
       y = "number of votes")

Using a line graph to show the election results from Loudoun County for the US House of Representatives reveals the years in which voters were most and least active. The activity of the voters was undoubtedly influenced by what types of elections were taking place during those years.

# State-wide New York governor rows: no district, county or city recorded,
# and a candidate id other than the literal string "null".
# The `year` column was already added to `nnv` earlier in this file, so it
# is not recomputed here.
# The ordering is spelled out as (year, descending vote) instead of relying
# on group_by() + arrange(), whose handling of groups differs between dplyr
# versions; the result is an ordinary ungrouped table.
ny_gov <- nnv %>%
  filter(
    name_id != "null",
    office == "Governor",
    state == "New York",
    is.na(district),
    is.na(county),
    is.na(city)
  ) %>%
  select(name, name_id, vote, year, affiliation) %>%
  arrange(year, desc(vote))

# One panel per year, one bar per candidate, votes on a log10 scale.
# The x aesthetic is the candidate name (years are the facets), so the
# axis is labelled accordingly.
ggplot(ny_gov, aes(x = name, y = vote)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ year) +
  scale_y_log10() +
  labs(title = "Votes for governor in New York",
       x = "name of candidate",
       y = "number of votes")
## Warning: Removed 2 rows containing missing values (position_stack).

```

# State-wide New York governor rows for 1816 only, highest vote first.
# `year` is an integer column, so it is compared with an integer literal
# rather than a string. Every row shares one year, so no grouping is needed.
ny_gov_2 <- nnv %>%
  filter(
    name_id != "null",
    office == "Governor",
    state == "New York",
    year == 1816L,
    is.na(district),
    is.na(county),
    is.na(city)
  ) %>%
  select(name, name_id, vote, year, affiliation) %>%
  arrange(desc(vote))

# Votes per candidate in 1816, shaded by party. Bars take their interior
# colour from `fill`; `color` would only tint the outline, so the party is
# mapped to `fill` and the legend title is set on the same aesthetic.
ggplot(ny_gov_2, aes(x = name, y = vote, fill = affiliation)) +
  geom_bar(stat = "identity") +
  labs(title = "Votes for Governor of New York in 1816",
       x = "name of candidate",
       y = "number of votes",
       fill = "party")

I wanted to examine a state that had a large number of governor observations, which led me to New York. I whittled down the data to show the state-wide votes for each candidate for governor for each year. I grouped by year and had the data arranged in descending order by vote. I visualized the data in a facet-wrapped graph. Because the labels along the X axis were distorted, this graph was not as helpful as it could have been. I then chose a year that had a relatively high number of votes to look more closely at one election year, since the facet wrap did not display in a readable way.