library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#install.packages("ggthemes")
library(ggthemes)
# Read FiveThirtyEight's new-voter-registration counts directly from GitHub.
# read.csv() turns the header "New registered voters" into the syntactic
# column name New.registered.voters used throughout the rest of the script.
vreg <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/voter-registration/new-voter-registrations.csv",
  header = TRUE
)
# Peek at the first rows to confirm the import worked.
head(vreg)
## Jurisdiction Year Month New.registered.voters
## 1 Arizona 2016 Jan 25852
## 2 Arizona 2016 Feb 51155
## 3 Arizona 2016 Mar 48614
## 4 Arizona 2016 Apr 30668
## 5 Arizona 2020 Jan 33229
## 6 Arizona 2020 Feb 50853
# Turn Month into a factor ordered Jan -> May so plots follow calendar order
# instead of alphabetical order. month.abb is base R's vector of English
# three-letter month abbreviations; its first five entries are Jan..May.
vreg[["Month"]] <- factor(vreg[["Month"]], levels = month.abb[1:5])
# Reshape to one row per Jurisdiction/Month with the two years side by side.
# pivot_wider() replaces the superseded spread(); names_prefix = "Y" builds the
# Y2016 / Y2020 column names from the Year values themselves, so the columns
# can no longer be mislabelled by a positional colnames<- assignment if the
# column order or the set of years in the data ever changes.
vregYear <- vreg %>%
  pivot_wider(
    names_from = Year,
    values_from = New.registered.voters,
    names_prefix = "Y"
  )
# Year-over-year difference in new registrations for each Jurisdiction/Month.
# Combinations absent in either year are left as NA by pivot_wider(), so their
# change is NA as well.
vregChange <- vregYear %>%
  mutate(change = Y2020 - Y2016)
# double check data
head(vregChange)
## Jurisdiction Month Y2016 Y2020 change
## 1 Arizona Jan 25852 33229 7377
## 2 Arizona Feb 51155 50853 -302
## 3 Arizona Mar 48614 31872 -16742
## 4 Arizona Apr 30668 10249 -20419
## 5 California Jan 87574 151595 64021
## 6 California Feb 103377 238281 134904
# First attempt: one bar per month showing the 2020-minus-2016 change,
# one panel per jurisdiction, styled with the FiveThirtyEight theme.
ggplot(data = vregChange, mapping = aes(x = Month, y = change)) +
  geom_col() +
  facet_wrap(facets = ~Jurisdiction) +
  labs(
    title = "Voter registration dropped dramatically during the pandemic",
    subtitle = "Difference in the number of newly registered voters for each month in 2020 compared to the same month in 2016",
    caption = "SOURCE: CENTER FOR ELECTION INNOVATION AND RESEARCH"
  ) +
  theme_fivethirtyeight()
Notes on my first attempt:
THEME: make background white, make title and subtitle text smaller in size, make state titles bold text, make y-axis scales different for each facet graph (and adjust y-axis tick labels accordingly?), keep only “Jan” and “May” for x-axis tick labels, change axis text font, get rid of vertical grid lines, keep only major y-axis grid lines
COLOR: create new dummy variable (1 for positive, 0 for negative change?) to assign to color aesthetic
OTHER: if possible find an additional labs() argument to insert the sentence “Some states treat voters who move between counties within a state as new registrants because they’re unregistered from their old county and newly registered in the new one.” Also, maybe find a way to place a dotted enclosure on graphs where May data was not available. If I want to be SUPER exact I would also find a way to place “FiveThirtyEight” in the bottom left corner, but that doesn’t feel as necessary since this is just a recreation.
# Flag each row by the sign of its change (TRUE when registrations rose or
# held steady, FALSE when they fell); the plot maps this flag to bar fill.
myvreg <- mutate(vregChange, poschange = change >= 0)
head(myvreg)
## Jurisdiction Month Y2016 Y2020 change poschange
## 1 Arizona Jan 25852 33229 7377 TRUE
## 2 Arizona Feb 51155 50853 -302 FALSE
## 3 Arizona Mar 48614 31872 -16742 FALSE
## 4 Arizona Apr 30668 10249 -20419 FALSE
## 5 California Jan 87574 151595 64021 TRUE
## 6 California Feb 103377 238281 134904 TRUE
# Keep large y-axis values (e.g. the Texas facet) out of scientific notation.
# Note: this is a session-wide option and is not restored afterwards.
options(scipen = 999)
# Recreation of the FiveThirtyEight chart: monthly change in new registrations
# (2020 minus 2016), one panel per jurisdiction, bars filled by sign of change.
ggplot(myvreg, aes(x = Month, y = change, fill = poschange)) +
  geom_col() +
  # Each jurisdiction gets its own y-axis range.
  facet_wrap(~Jurisdiction, scales = "free_y") +
  # Label only the first and last month. The labels are named by factor level
  # so they stay attached to the right month regardless of break order/count.
  scale_x_discrete(
    labels = c(Jan = "Jan.", Feb = "", Mar = "", Apr = "", May = "May")
  ) +
  scale_y_continuous(n.breaks = 4) +
  theme_fivethirtyeight() +
  labs(
    title = "Voter registration dropped dramatically during the pandemic",
    subtitle = "Difference in the number of newly registered voters for each month in 2020 compared to the same month in 2016",
    caption = "SOURCE: CENTER FOR ELECTION INNOVATION AND RESEARCH"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    # `color` only sets the rectangle's border; the background itself is
    # controlled by `fill`, so set both to actually get a white background.
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "white", color = "white"),
    # Facet strips inherit the theme's grey fill unless overridden as well.
    strip.background = element_rect(fill = "white", color = "white"),
    legend.position = "none",
    plot.title = element_text(size = 10, hjust = 0.5),
    plot.subtitle = element_text(size = 8, hjust = 0.5),
    strip.text.x = element_text(face = "bold", size = 8),
    plot.caption = element_text(size = 5, color = "grey"),
    axis.text.x = element_text(color = "grey"),
    axis.text.y = element_text(color = "grey", size = 8)
  )
This is my final graphic. If I wanted to be more exact to the original graphic, I would:
format the y-axis tick labels such that 2000 was 2K
work harder to override the background color to be white (but the fivethirtyeight theme has a pretty accurate format with the exception of the background color not being white so I’m not too worried about it)
mess around more with the y-axis scale. I played around with n.breaks, and 4 gave me the output closest to the original graphic; however, the axes for Arizona, Delaware, DC, Georgia, Illinois and North Carolina have more labels than the original, and the axis for Texas has different values labelled. Overall they are still accurate and deliver the same information. It makes me wonder if the creator of the original graphic used a different method.
try to find out how to enclose the regions with no available data.
But overall, I think this is sufficient for the goal of this recreation!
Now I will create an alternate visual! My first thought is a slope graph, because I like them (I think they are super intuitive and easy to read!) and they’re great at showing change between two dates, which is exactly the kind of data we have.
#install.packages("esquisse")
library(esquisse)
#esquisser() -- to play around a bit
# Alternate visual: a slope chart of new registrations in 2016 vs 2020,
# one line per jurisdiction, faceted by month, styled with the WSJ theme.
ggplot(vreg) +
  aes(
    x = as.numeric(Year), y = New.registered.voters,
    group = Jurisdiction, color = Jurisdiction
  ) +
  geom_line() +
  geom_point(size = 1) +
  #geom_label(aes(label=Jurisdiction), size=2)+
  facet_grid(~Month) +
  theme_wsj() +
  # Put breaks exactly on the two years being compared. This replaces a
  # 7-element label vector padded with blanks, which only worked while ggplot
  # happened to compute exactly seven breaks for the expanded axis range.
  scale_x_continuous(
    breaks = c(2016, 2020),
    labels = c("2016", "2020"),
    expand = c(.2, .2)
  ) +
  # Derive the "100K"-style labels from the break values themselves so the
  # text always matches the tick it is drawn next to.
  scale_y_continuous(
    n.breaks = 3,
    labels = function(v) ifelse(v == 0, "0", paste0(v / 1000, "K"))
  ) +
  labs(
    title = "Spring 2020 sees a drop in voter registration compared to 2016",
    y = "New voter registrations",
    subtitle = "With a national state of emergency declared in March, voter registration felt the impact of the pandemic",
    caption = "SOURCE: CENTER FOR ELECTION INNOVATION AND RESEARCH"
  ) +
  theme(
    strip.text.x = element_text(face = "bold", size = 8),
    # Removes ticks on both axes (a separate axis.ticks.x entry is redundant).
    axis.ticks = element_blank(),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    axis.title.y = element_text(size = 8, vjust = 5, face = "bold"),
    plot.title = element_text(size = 12),
    legend.title = element_text(size = 8, face = "bold"),
    legend.text = element_text(size = 7),
    legend.position = "bottom",
    plot.subtitle = element_text(size = 7),
    panel.grid.major.y = element_line(color = "grey"),
    plot.caption = element_text(size = 8, face = "italic", color = "darkgrey")
  )
While it’s easy to think of a million critiques of any graphic, this is what I ended up with. A slope chart to show the difference in voter registrations between 2016 and 2020, faceting by month. Color is assigned to jurisdiction so one could identify a particular jurisdiction if they wanted to, but the important story here is the greater trends in the data across jurisdictions. I also wanted to include in my subtitle that while the pandemic was developing in Jan and Feb, it was in March that a national state of emergency was declared. It was March when things really changed in the USA, and I think this faceted slope graph shows March as a turning point. Finally, the color/design choices speak to my personal style which is always fun to incorporate since I’m not making this for any particular news/media outlet/company.