Load tidyverse
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
Import data
# Read the FiveThirtyEight new-voter-registration CSV straight from GitHub;
# the first row of the file supplies the column names.
vreg <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/voter-registration/new-voter-registrations.csv",
  header = TRUE
)
Set the levels of the Month variable so that it’s in the right (calendar) order, i.e., not alphabetical
# Turn Month into a factor with explicitly ordered levels so that plots and
# sorts follow Jan -> May rather than alphabetical order.
vreg[["Month"]] <- factor(
  vreg[["Month"]],
  levels = c("Jan", "Feb", "Mar", "Apr", "May")
)
USE spread() FROM tidyr
# Reshape from long to wide: every distinct Year value becomes its own
# column, filled with that year's New.registered.voters count.
vregYear <- spread(vreg, key = Year, value = New.registered.voters)
RENAME THE COLUMNS
# Overwrite the four column names by position; the last two get a "Y"
# prefix so they can be referenced without backticks.
names(vregYear) <- c("Jurisdiction", "Month", "Y2016", "Y2020")
mutate() FROM dplyr
# Add `change`: the difference between the Y2020 and Y2016 columns
# (positive when the 2020 value is larger).
vregChange <- mutate(vregYear, change = Y2020 - Y2016)
Re-Create Journey
Analyzing Structure
# Show the column types and the first few values of each column.
str(object = vregChange)
## 'data.frame': 53 obs. of 5 variables:
## $ Jurisdiction: chr "Arizona" "Arizona" "Arizona" "Arizona" ...
## $ Month : Factor w/ 5 levels "Jan","Feb","Mar",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Y2016 : int 25852 51155 48614 30668 87574 103377 174278 185478 17024 20707 ...
## $ Y2020 : int 33229 50853 31872 10249 151595 238281 176810 38970 20260 33374 ...
## $ change : int 7377 -302 -16742 -20419 64021 134904 2532 -146508 3236 12667 ...
# Per-column summaries (counts for the factor, quantiles for the numerics).
summary(object = vregChange)
## Jurisdiction Month Y2016 Y2020
## Length:53 Jan:12 Min. : 2840 Min. : 589
## Class :character Feb:12 1st Qu.: 20460 1st Qu.: 10249
## Mode :character Mar:12 Median : 37028 Median : 29507
## Apr:12 Mean : 51956 Mean : 44491
## May: 5 3rd Qu.: 73627 3rd Qu.: 54053
## Max. :185478 Max. :238281
## change
## Min. :-146508
## 1st Qu.: -17866
## Median : -3789
## Mean : -7464
## 3rd Qu.: 1952
## Max. : 134904
I’m going to adjust the data a little further by adding a column that flags whether each change is positive or negative, which makes it easier to fill the bars according to the direction of the change.
# Flag each row by the sign of its change: "TRUE" when change >= 0,
# "FALSE" when it is negative. Stored as a factor so it can drive a
# discrete fill colour in the plots.
vregChange_0 <- mutate(vregChange, Zero = factor(change >= 0))

# Preview the first rows to confirm the new column.
head(x = vregChange_0)
## Jurisdiction Month Y2016 Y2020 change Zero
## 1 Arizona Jan 25852 33229 7377 TRUE
## 2 Arizona Feb 51155 50853 -302 FALSE
## 3 Arizona Mar 48614 31872 -16742 FALSE
## 4 Arizona Apr 30668 10249 -20419 FALSE
## 5 California Jan 87574 151595 64021 TRUE
## 6 California Feb 103377 238281 134904 TRUE
Now to re-create the viz
# Baseline chart: bars of `change` per month, with every jurisdiction's
# rows stacked into the same month bar.
ggplot(data = vregChange_0, mapping = aes(x = Month, y = change)) +
  geom_col()

# Same bars, split into one panel per jurisdiction (shared axes).
ggplot(data = vregChange_0, mapping = aes(x = Month, y = change)) +
  geom_col() +
  facet_wrap(facets = ~Jurisdiction)

# Colour the bars by the sign of the change and give every jurisdiction
# its own y-axis so that small states are not flattened by large ones.
ggplot(vregChange_0, aes(x = Month, y = change, fill = Zero)) +
  geom_col() +
  # The argument is `scales`; spell it out instead of relying on R's
  # partial argument matching of `scale`.
  facet_wrap(~Jurisdiction, scales = "free_y")

# Re-creation of the target graphic: per-jurisdiction panels of monthly
# change, coloured by sign, with a zero baseline and a stripped-down theme.
ggplot(vregChange_0, aes(x = Month, y = change, fill = Zero)) +
  geom_col() +
  # Reference line separating increases from decreases.
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    # Drop the legend entirely; blanking only its text and title left
    # unlabeled colour keys beside the plot.
    legend.position = "none",
    axis.text = element_text(color = "gray"),
    axis.title = element_blank()
  ) +
  # Label only the first and last month; the months in between get a
  # blank label to keep the small panels uncluttered.
  scale_x_discrete(
    labels = c("Jan" = "Jan", "Feb" = " ", "Mar" = " ", "Apr" = " ", "May" = "May")
  ) +
  # `scales` written in full rather than the partially matched `scale`.
  facet_wrap(~Jurisdiction, scales = "free_y")

Creating a new graphic
# Print the full table for reference before designing the new graphic.
print(vregChange_0)
## Jurisdiction Month Y2016 Y2020 change Zero
## 1 Arizona Jan 25852 33229 7377 TRUE
## 2 Arizona Feb 51155 50853 -302 FALSE
## 3 Arizona Mar 48614 31872 -16742 FALSE
## 4 Arizona Apr 30668 10249 -20419 FALSE
## 5 California Jan 87574 151595 64021 TRUE
## 6 California Feb 103377 238281 134904 TRUE
## 7 California Mar 174278 176810 2532 TRUE
## 8 California Apr 185478 38970 -146508 FALSE
## 9 Colorado Jan 17024 20260 3236 TRUE
## 10 Colorado Feb 20707 33374 12667 TRUE
## 11 Colorado Mar 25627 18990 -6637 FALSE
## 12 Colorado Apr 22204 6034 -16170 FALSE
## 13 Delaware Jan 3007 3276 269 TRUE
## 14 Delaware Feb 3629 3353 -276 FALSE
## 15 Delaware Mar 5124 2535 -2589 FALSE
## 16 Delaware Apr 3818 589 -3229 FALSE
## 17 District of Columbia Jan 2840 3334 494 TRUE
## 18 District of Columbia Feb 2954 3348 394 TRUE
## 19 District of Columbia Mar 4706 2225 -2481 FALSE
## 20 District of Columbia Apr 4157 1281 -2876 FALSE
## 21 District of Columbia May 5714 1925 -3789 FALSE
## 22 Florida Jan 50231 77466 27235 TRUE
## 23 Florida Feb 87351 109859 22508 TRUE
## 24 Florida Mar 73627 54872 -18755 FALSE
## 25 Florida Apr 52508 21031 -31477 FALSE
## 26 Georgia Jan 34952 38573 3621 TRUE
## 27 Georgia Feb 40976 55386 14410 TRUE
## 28 Georgia Mar 44150 26284 -17866 FALSE
## 29 Georgia Apr 37028 15484 -21544 FALSE
## 30 Illinois Jan 44040 44443 403 TRUE
## 31 Illinois Feb 99674 68455 -31219 FALSE
## 32 Illinois Mar 52782 47899 -4883 FALSE
## 33 Illinois Apr 76098 21332 -54766 FALSE
## 34 Maryland Jan 19580 21532 1952 TRUE
## 35 Maryland Feb 29122 20708 -8414 FALSE
## 36 Maryland Mar 40497 23864 -16633 FALSE
## 37 Maryland Apr 26655 10061 -16594 FALSE
## 38 Maryland May 5828 23488 17660 TRUE
## 39 North Carolina Jan 35213 111990 76777 TRUE
## 40 North Carolina Feb 84357 54053 -30304 FALSE
## 41 North Carolina Mar 58272 54807 -3465 FALSE
## 42 North Carolina Apr 73341 35484 -37857 FALSE
## 43 North Carolina May 29374 23517 -5857 FALSE
## 44 Texas Jan 132860 134559 1699 TRUE
## 45 Texas Feb 143795 130080 -13715 FALSE
## 46 Texas Mar 170607 129424 -41183 FALSE
## 47 Texas Apr 143199 34694 -108505 FALSE
## 48 Texas May 91205 35678 -55527 FALSE
## 49 Virginia Jan 20032 25934 5902 TRUE
## 50 Virginia Feb 36911 29507 -7404 FALSE
## 51 Virginia Mar 44171 31492 -12679 FALSE
## 52 Virginia Apr 20460 5467 -14993 FALSE
## 53 Virginia May 26239 8239 -18000 FALSE
I first considered that the faceting by “Jurisdiction” might need to be changed, because it slightly skews the scale from state to state. So, I thought it might be better to color by state and still use the zero-intercept line to denote positive or negative change. This does make the graph a little “busier” because of the colors… I abbreviated the state names to make more room on the x-axis. The states are all on the same scale now, so the numbers seem to be represented more faithfully, but maybe the point is the percentage? For instance, the percentages might be nearly the same, but in California we’re dealing with +/-100,000 people, vs. Delaware, where we are dealing with 0 to -3,000. In this graphic, at the very least, one can clearly see that registrations were up everywhere prior to COVID, and that from the end of February, moving into COVID, they are significantly down everywhere.
# One panel per month, one bar per jurisdiction, all panels sharing the
# same y-axis so the magnitudes are directly comparable across states.
ggplot(vregChange_0, aes(x = Jurisdiction, y = change, fill = Jurisdiction)) +
  geom_col() +
  theme_minimal() +
  # Only `fill` is mapped, so the hue scale must be the fill scale;
  # scale_color_hue() had nothing to act on.
  scale_fill_hue() +
  # Reference line separating increases from decreases.
  geom_hline(yintercept = 0) +
  theme(
    axis.line.x = element_blank(),
    axis.text.x = element_text(angle = 90),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text = element_text(color = "gray"),
    panel.grid.minor.y = element_blank()
  ) +
  # Abbreviate the jurisdiction names so the rotated tick labels fit.
  scale_x_discrete(
    labels = c(
      "Arizona" = "AZ", "California" = "CA", "Colorado" = "CO",
      "Delaware" = "DE", "District of Columbia" = "DC", "Florida" = "FL",
      "Georgia" = "GA", "Illinois" = "IL", "Maryland" = "MD",
      "North Carolina" = "NC", "Texas" = "TX", "Virginia" = "VA"
    )
  ) +
  # `scales` written in full rather than the partially matched `scale`;
  # free x lets each month panel drop jurisdictions it has no rows for.
  facet_wrap(~Month, scales = "free_x")

Let’s try another idea…
# Single panel: months on the x-axis with the jurisdictions drawn as
# side-by-side bars, plus a zero baseline.
ggplot(
  data = vregChange_0,
  mapping = aes(x = Month, y = change, fill = Jurisdiction)
) +
  geom_col(position = "dodge") +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

So… let’s go back to my thoughts about comparing based on percent decline vs. number of registrations. That way, at least, we’re on the same scale. I do like faceting by state, because grouping all the states together is a little too cluttered and makes them harder to differentiate. I’m going to create another couple of columns: one to calculate the percent increase/decrease, and a factor stating whether that change is positive or negative, so it can easily be visualized by color.
# Add the relative change in registrations and a flag for its sign.
#   Percent      - change from 2016 to 2020 as a proportion of the 2016
#                  baseline (e.g. -0.5 means 2020 was half of 2016).
#                  Dividing by Y2020 instead would overstate declines and
#                  understate increases.
#   Zero_Percent - factor "TRUE"/"FALSE": is Percent >= 0?
vregChange_0_percent <- vregChange_0 %>%
  mutate(
    Percent = (Y2020 - Y2016) / Y2016,
    Zero_Percent = as.factor(Percent >= 0)
  )
Faceted by Jurisdiction compared by % change
# Per-jurisdiction panels of the relative change by month, coloured by
# sign, with a zero baseline.
ggplot(vregChange_0_percent, aes(x = Month, y = Percent, fill = Zero_Percent)) +
  geom_col() +
  # Reference line separating increases from decreases.
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    # Remove the legend outright; blanking its text, title and key
    # background still left unlabeled colour swatches.
    legend.position = "none"
  ) +
  # `scales` written in full rather than the partially matched `scale`.
  facet_wrap(~Jurisdiction, scales = "free_y")
