Data 502 - Midterm Part 3

Load tidyverse

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
library(dplyr)

Import data

# Pull the FiveThirtyEight new-voter-registration table straight from GitHub;
# the first row of the CSV holds the column names.
vreg <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/voter-registration/new-voter-registrations.csv",
  header = TRUE
)

Level the Month variable so that it's in the right order (i.e., not alphabetical)

# Store Month as a factor with calendar-ordered levels so that plots and
# summaries list Jan..May chronologically rather than alphabetically.
vreg[["Month"]] <- factor(
  vreg[["Month"]],
  levels = c("Jan", "Feb", "Mar", "Apr", "May")
)

USE spread() FROM tidyr

# Reshape long -> wide: each distinct Year becomes its own column holding the
# matching New.registered.voters count.
# NOTE(review): tidyr marks spread() as superseded by pivot_wider(); kept here
# so the resulting object and its printed output stay the same.
vregYear <- spread(vreg, key = Year, value = New.registered.voters)

RENAME THE COLUMNS

# Give the year columns syntactic names (Y2016 / Y2020) so they can be used
# unquoted in later expressions. Order must match the spread() output.
names(vregYear) <- c("Jurisdiction", "Month", "Y2016", "Y2020")

mutate() FROM dplyr

# Absolute change in new registrations per jurisdiction and month:
# the 2020 count minus the 2016 count.
vregChange <- mutate(vregYear, change = Y2020 - Y2016)

Re-Create Journey

Analyzing Structure

# Show the column types and a preview of the first values in each column.
str(vregChange)

## 'data.frame':    53 obs. of  5 variables:
##  $ Jurisdiction: chr  "Arizona" "Arizona" "Arizona" "Arizona" ...
##  $ Month       : Factor w/ 5 levels "Jan","Feb","Mar",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Y2016       : int  25852 51155 48614 30668 87574 103377 174278 185478 17024 20707 ...
##  $ Y2020       : int  33229 50853 31872 10249 151595 238281 176810 38970 20260 33374 ...
##  $ change      : int  7377 -302 -16742 -20419 64021 134904 2532 -146508 3236 12667 ...

# Per-column summary: quartiles for the numeric columns, level counts for Month.
summary(vregChange)

##  Jurisdiction       Month        Y2016            Y2020       
##  Length:53          Jan:12   Min.   :  2840   Min.   :   589  
##  Class :character   Feb:12   1st Qu.: 20460   1st Qu.: 10249  
##  Mode  :character   Mar:12   Median : 37028   Median : 29507  
##                     Apr:12   Mean   : 51956   Mean   : 44491  
##                     May: 5   3rd Qu.: 73627   3rd Qu.: 54053  
##                              Max.   :185478   Max.   :238281  
##      change       
##  Min.   :-146508  
##  1st Qu.: -17866  
##  Median :  -3789  
##  Mean   :  -7464  
##  3rd Qu.:   1952  
##  Max.   : 134904

I’m actually going to adjust the data a little further to create a column for positive and negative change in order to more easily fill columns based on positive or negative change.

# Flag each row by the sign of its change: the factor level is "TRUE" when
# registrations held steady or rose, "FALSE" when they fell. Used as the fill
# variable in the plots below.
vregChange_0 <- mutate(vregChange, Zero = as.factor(change >= 0))

# Peek at the first six rows to confirm the new Zero column looks right.
head(vregChange_0)

##   Jurisdiction Month  Y2016  Y2020 change  Zero
## 1      Arizona   Jan  25852  33229   7377  TRUE
## 2      Arizona   Feb  51155  50853   -302 FALSE
## 3      Arizona   Mar  48614  31872 -16742 FALSE
## 4      Arizona   Apr  30668  10249 -20419 FALSE
## 5   California   Jan  87574 151595  64021  TRUE
## 6   California   Feb 103377 238281 134904  TRUE

Now to re-create the viz

# Step 1: bare column chart of change by month, all jurisdictions together.
ggplot(data = vregChange_0, mapping = aes(x = Month, y = change)) +
  geom_col()

# Step 2: same chart, split into one panel per jurisdiction.
ggplot(data = vregChange_0, mapping = aes(x = Month, y = change)) +
  geom_col() +
  facet_wrap(~Jurisdiction)

# Step 3: colour bars by the sign of the change and let each jurisdiction
# panel pick its own y-axis range.
# The argument is spelled out as `scales`; the previous `scale =` only worked
# through R's partial argument matching.
ggplot(vregChange_0, aes(x = Month, y = change, fill = Zero)) +
  geom_col() +
  facet_wrap(~Jurisdiction, scales = "free_y")

# Step 4: the re-created visualisation. One panel per jurisdiction, bars
# coloured by sign of change, with a zero reference line.
ggplot(vregChange_0, aes(x = Month, y = change, fill = Zero)) +
  geom_col() +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(
    # Keep only the major horizontal grid lines.
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    # Hide the legend title and labels (TRUE/FALSE).
    legend.text = element_blank(),
    legend.title = element_blank(),
    axis.text = element_text(color = "gray"),
    axis.title = element_blank()
  ) +
  # Label only the first and last month on the x-axis; blank out the rest.
  scale_x_discrete(labels = c("Jan" = "Jan", "Feb" = " ", "Mar" = " ", "Apr" = " ", "May" = "May")) +
  # `scales` is written in full; `scale =` relied on partial argument matching.
  facet_wrap(~Jurisdiction, scales = "free_y")

Creating a new graphic

# Print the full table for reference while designing the new graphic.
vregChange_0

##            Jurisdiction Month  Y2016  Y2020  change  Zero
## 1               Arizona   Jan  25852  33229    7377  TRUE
## 2               Arizona   Feb  51155  50853    -302 FALSE
## 3               Arizona   Mar  48614  31872  -16742 FALSE
## 4               Arizona   Apr  30668  10249  -20419 FALSE
## 5            California   Jan  87574 151595   64021  TRUE
## 6            California   Feb 103377 238281  134904  TRUE
## 7            California   Mar 174278 176810    2532  TRUE
## 8            California   Apr 185478  38970 -146508 FALSE
## 9              Colorado   Jan  17024  20260    3236  TRUE
## 10             Colorado   Feb  20707  33374   12667  TRUE
## 11             Colorado   Mar  25627  18990   -6637 FALSE
## 12             Colorado   Apr  22204   6034  -16170 FALSE
## 13             Delaware   Jan   3007   3276     269  TRUE
## 14             Delaware   Feb   3629   3353    -276 FALSE
## 15             Delaware   Mar   5124   2535   -2589 FALSE
## 16             Delaware   Apr   3818    589   -3229 FALSE
## 17 District of Columbia   Jan   2840   3334     494  TRUE
## 18 District of Columbia   Feb   2954   3348     394  TRUE
## 19 District of Columbia   Mar   4706   2225   -2481 FALSE
## 20 District of Columbia   Apr   4157   1281   -2876 FALSE
## 21 District of Columbia   May   5714   1925   -3789 FALSE
## 22              Florida   Jan  50231  77466   27235  TRUE
## 23              Florida   Feb  87351 109859   22508  TRUE
## 24              Florida   Mar  73627  54872  -18755 FALSE
## 25              Florida   Apr  52508  21031  -31477 FALSE
## 26              Georgia   Jan  34952  38573    3621  TRUE
## 27              Georgia   Feb  40976  55386   14410  TRUE
## 28              Georgia   Mar  44150  26284  -17866 FALSE
## 29              Georgia   Apr  37028  15484  -21544 FALSE
## 30             Illinois   Jan  44040  44443     403  TRUE
## 31             Illinois   Feb  99674  68455  -31219 FALSE
## 32             Illinois   Mar  52782  47899   -4883 FALSE
## 33             Illinois   Apr  76098  21332  -54766 FALSE
## 34             Maryland   Jan  19580  21532    1952  TRUE
## 35             Maryland   Feb  29122  20708   -8414 FALSE
## 36             Maryland   Mar  40497  23864  -16633 FALSE
## 37             Maryland   Apr  26655  10061  -16594 FALSE
## 38             Maryland   May   5828  23488   17660  TRUE
## 39       North Carolina   Jan  35213 111990   76777  TRUE
## 40       North Carolina   Feb  84357  54053  -30304 FALSE
## 41       North Carolina   Mar  58272  54807   -3465 FALSE
## 42       North Carolina   Apr  73341  35484  -37857 FALSE
## 43       North Carolina   May  29374  23517   -5857 FALSE
## 44                Texas   Jan 132860 134559    1699  TRUE
## 45                Texas   Feb 143795 130080  -13715 FALSE
## 46                Texas   Mar 170607 129424  -41183 FALSE
## 47                Texas   Apr 143199  34694 -108505 FALSE
## 48                Texas   May  91205  35678  -55527 FALSE
## 49             Virginia   Jan  20032  25934    5902  TRUE
## 50             Virginia   Feb  36911  29507   -7404 FALSE
## 51             Virginia   Mar  44171  31492  -12679 FALSE
## 52             Virginia   Apr  20460   5467  -14993 FALSE
## 53             Virginia   May  26239   8239  -18000 FALSE

I first considered that maybe the orientation of the facets based on “Jurisdiction” might need to be changed because it is slightly skewing the scale state by state. So, I thought maybe it would be better to color by state and still use the 0 intercept line to denote positive or negative change. It does make the graph a little “busier” because of the colors… I abbreviated the states to make more room on the x-axis. They are all on the same scale now, so the integrity of the representation of the numbers seems better, but maybe the point is percentage? For instance, the percentages might be close to the same, but in California we’re dealing with +/-100,000 people, vs. Delaware, where we are dealing with 0 to -3000. In this graphic, at the very least, one can see clearly that everywhere was up prior to COVID, and then from the end of February, moving into COVID, rates are significantly down everywhere.

# Alternative view: one bar per jurisdiction, one panel per month, all panels
# sharing the same y-axis so magnitudes are comparable across states.
ggplot(vregChange_0, aes(x = Jurisdiction, y = change, fill = Jurisdiction)) +
  geom_col() +
  theme_minimal() +
  # The bars are coloured through `fill`, so the hue scale must be the fill
  # variant; scale_color_hue() had no effect because no colour aesthetic is
  # mapped.
  scale_fill_hue() +
  geom_hline(yintercept = 0) +
  theme(
    axis.line.x = element_blank(),
    # Rotate the state abbreviations so they fit under each bar.
    axis.text.x = element_text(angle = 90),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text = element_text(color = "gray"),
    panel.grid.minor.y = element_blank()
  ) +
  # Show postal abbreviations instead of full names to save horizontal space.
  scale_x_discrete(labels = c("Arizona" = "AZ", "California" = "CA", "Colorado" = "CO", "Delaware" = "DE", "District of Columbia" = "DC", "Florida" = "FL", "Georgia" = "GA", "Illinois" = "IL", "Maryland" = "MD", "North Carolina" = "NC", "Texas" = "TX", "Virginia" = "VA")) +
  # free_x drops jurisdictions with no data for a month from that panel.
  # `scales` is written in full; `scale =` relied on partial argument matching.
  facet_wrap(~Month, scales = "free_x")

Let’s try another idea…

# Single-panel variant: months on the x-axis, one dodged bar per jurisdiction,
# with a zero reference line and no major grid lines.
ggplot(data = vregChange_0, mapping = aes(x = Month, y = change, fill = Jurisdiction)) +
  geom_col(position = "dodge") +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

So… let’s go back to my thoughts about comparing based on percent decline vs. number of registrations. That way at least we’re on the same scale. I do like the facet by state because grouping all the states together is a little too clustered and harder to differentiate. I’m going to create another couple of columns to calculate percent increase/decrease and a factor stating whether that change is positive or negative in order to visualize by color easily.

# Relative change in new registrations from 2016 to 2020.
#   Percent      - change as a proportion of the 2016 baseline
#                  (e.g. -0.25 means 2020 was 25% below 2016).
#   Zero_Percent - factor flag, "TRUE" when Percent is >= 0, for use as a fill.
# The denominator is the 2016 count: dividing by Y2020 (as before) measures
# the change against the *new* value and produces declines beyond -100%.
vregChange_0_percent <- vregChange_0 %>%
  mutate(
    Percent = (Y2020 - Y2016) / Y2016,
    Zero_Percent = as.factor(Percent >= 0)
  )

Faceted by Jurisdiction compared by % change

# Percent change by month, one panel per jurisdiction, bars coloured by sign.
ggplot(vregChange_0_percent, aes(x = Month, y = Percent, fill = Zero_Percent)) +
  geom_col() +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    axis.title.x = element_blank(),
    # Strip the legend text, title and key backgrounds.
    legend.text = element_blank(),
    legend.title = element_blank(),
    legend.key = element_blank()
  ) +
  # `scales` is written in full; `scale =` relied on partial argument matching.
  facet_wrap(~Jurisdiction, scales = "free_y")

No facet, fill by Jurisdiction based on % change. Although… I can’t figure out why the intercept line at 0 is angled… and it’s too late to fix it.

# Percent change per jurisdiction, one panel per month, bars coloured by the
# sign of the change.
ggplot(vregChange_0_percent, aes(x = Jurisdiction, y = Percent, fill = Zero_Percent)) +
  geom_col() +
  theme_minimal() +
  # Only `fill` is mapped, so the hue scale must be the fill variant;
  # scale_color_hue() had no effect on this plot.
  scale_fill_hue() +
  geom_hline(yintercept = 0) +
  theme(
    axis.line.x = element_blank(),
    # Rotate the state abbreviations so they fit under each bar.
    axis.text.x = element_text(angle = 90),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text = element_text(color = "gray"),
    panel.grid.minor.y = element_blank()
  ) +
  # Show postal abbreviations instead of full names to save horizontal space.
  scale_x_discrete(labels = c("Arizona" = "AZ", "California" = "CA", "Colorado" = "CO", "Delaware" = "DE", "District of Columbia" = "DC", "Florida" = "FL", "Georgia" = "GA", "Illinois" = "IL", "Maryland" = "MD", "North Carolina" = "NC", "Texas" = "TX", "Virginia" = "VA")) +
  # `scales` is written in full; `scale =` relied on partial argument matching.
  facet_wrap(~Month, scales = "free_x")

# Single-panel percent view: months on the x-axis, one dodged bar per
# jurisdiction, zero reference line, major grid lines removed.
ggplot(
  data = vregChange_0_percent,
  mapping = aes(x = Month, y = Percent, fill = Jurisdiction)
) +
  geom_col(position = "dodge") +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

Data 502 - Midterm Part 3

Rochelle Rafn

10/11/2021

Load tidyverse

Import data

Level the Month variable so that it's in the right order (i.e., not alphabetical)

USE spread() FROM tidyr

RENAME THE COLUMNS

mutate() FROM dplyr

Re-Create Journey

Analyzing Structure

I’m actually going to adjust the data a little further to create a column for positive and negative change in order to more easily fill columns based on positive or negative change.

Now to re-create the viz

Creating a new graphic

Let’s try another idea…

Faceted by Jurisdiction compared by % change

No facet, fill by Jurisdiction based on % change. Although… I can’t figure out why the intercept line at 0 is angled… and it’s too late to fix it.