Data Visualization Work Samples

# Load package(s)
library(tidyverse)
library(ggthemes)
library(patchwork)
library(sf)
library(tigris)
library(viridis)
library(statebins)
library(ggrepel)
library(grid)
library(jpeg)

# Read in the dataset(s)
ga_data <- read.csv("data/ga_election_data.csv")%>%
  janitor::clean_names()
ga_map <- counties(state = "GA", cb = TRUE) %>%
  janitor::clean_names()
load("data/US_income.rda")
load("data/tech_stocks.rda")
load("data/corruption.rda")
admin_data <- read.csv("data/NU_admission_data.csv")
steph <- read_delim(file = "data/stephen_curry_shotdata_2014_15.txt", delim = "|") %>%
  janitor::clean_names()

Steph Curry, the Greatest Shooter

# load image
court <- rasterGrob(readJPEG(source = "data/nbahalfcourt.jpg"),
  width = unit(1, "npc"), height = unit(1, "npc")
)

ggplot() +
  #fill grid with image
  annotation_custom(
    grob = court,
    xmin = -250, xmax = 250,
    ymin = -52, ymax = 418
  ) +
  coord_fixed() +
  xlim(250, -250) +
  ylim(-52, 418) +
  theme_void() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(face = "bold", size = 14)) +
  labs(title = "Stephen Curry",
       subtitle = "Shot Chart (2014-2015)",
       x = NULL,
       y = NULL) +
  geom_hex(data = steph, aes(x = loc_x, y = loc_y), 
           color = "grey", bins = 20, alpha = 0.7) +
  scale_fill_gradient(name = "Shot \nAttempts",
                      low = "yellow", high = "red",
                      breaks = c(0, 5, 10, 15),
                      labels = c(0, 5, 10, "15+"),
                      limits = c(0, 15),
                      na.value = "red"
                      )

Early voting in Georgia

ga_graph <- ga_data %>%
  mutate(prop_pre_eday = (absentee_by_mail_votes + advanced_voting_votes)/ total_votes) %>%
  select(-contains("_vote"))

biden_map <- ga_map %>%
  left_join(ga_graph, by = c("name" = "county")) %>%
  filter(candidate == "Joseph R. Biden")

trump_map <- ga_map %>%
  left_join(ga_graph, by = c("name" = "county")) %>%
  filter(candidate == "Donald J. Trump")

Dems <- ggplot(biden_map) +
  geom_sf(aes(fill = prop_pre_eday), show.legend = FALSE) +
  labs(title = "Joseph R. Biden", subtitle = "Democratic Nominee") +
  theme_map() +
  scale_fill_gradient2(high = "#5D3A9B",
                       mid = "white",
                       low = "#1AFF1A",
                       midpoint = 0.75,
                       breaks = c(0.5, 0.75, 1.00),
                       limits = c(0.5, 1.0),
                       labels = c("50%", "75%", "100%")) +
  theme(plot.title = element_text(face = "bold", size = 14),
        plot.subtitle = element_text(size = 12))

GOP <- ggplot(trump_map) +
  geom_sf(aes(fill = prop_pre_eday)) +
  labs(title = "Donald J. Trump", subtitle = "Republican Nominee") +
  theme_map() +
  scale_fill_gradient2(high = "#5D3A9B",
                       mid = "white",
                       low = "#1AFF1A",
                       midpoint = 0.75,
                       breaks = c(0.5, 0.75, 1.00),
                       limits = c(0.5, 1.0),
                       labels = c("50%", "75%", "100%")) +
  theme(plot.title = element_text(face = "bold", size = 14),
        plot.subtitle = element_text(size = 12),
        legend.position = c(1,1),
        legend.title = element_blank(),
        legend.justification = c(1, 1))

plot <- Dems + GOP

plot +
  plot_annotation( 
    title = 'Percentage of votes from early voting',
    caption = 'Georgia: 2020 US Presidential Election Results',
    theme = theme(plot.title = element_text(face = "bold", size = 24),
                  plot.caption = element_text(size = 12)))

Summary: It is clear that more Democratic voters voted early, with many areas over 80%. Among Republican voters, fewer people voted early, but still most counties had more than 50% of the votes came before election day.

Median Income in the U.S.

# Setting income levels
US_income <- mutate(
  US_income,
  income_bins = cut(
    ifelse(is.na(median_income), 25000, median_income),
    breaks = c(0, 40000, 50000, 60000, 70000, 80000),
    labels = c("< $40k", "$40k to $50k", 
               "$50k to $60k", "$60k to $70k", "> $70k"),
    right = FALSE
  )
)

Plot 1

ggplot(US_income)+
  geom_sf(aes(fill = income_bins),
          color = "grey80", size = 0.2)+
  scale_fill_viridis(discrete = TRUE)+
  theme_void()+
  labs(fill = "Median \nIncome")

Plot 2

ggplot(US_income)+
  geom_statebins(aes(fill = income_bins, state = name))+
  scale_fill_viridis(discrete = TRUE)+
  theme_statebins()+
  labs(fill = "Median \nIncome")

Tech Stocks

ggplot(data = tech_stocks, 
       aes(x = date, y = price_indexed, 
           color = company)) +
  geom_line(key_glyph = "timeseries") +
  scale_y_continuous(breaks = c(0, 100, 200, 300, 400, 500),
                     labels = scales::dollar,
                     position = "right") +
  scale_x_date(expand = c(0,0)) +
  lims(color = c("Facebook", "Alphabet", "Microsoft", "Apple")) +
  ylab(NULL) +
  xlab(NULL) +
  guides(color = guide_legend(override.aes = list(size = 1.3))) +
  labs(title = "Stock price, indexed") +
  theme_minimal() +
  theme(legend.position = c(0.75, 0.85),
        legend.title = element_blank())

Corruption and Development

corrupt_2015 <- corruption %>%
  filter(year == "2015")

label <- corrupt_2015 %>%
  filter(country %in% c("Argentina", "Chile", "China", "Japan", "Ghana", "Iraq", "Niger", "Singapore", "United States"))

ggplot(data = corrupt_2015,
       aes(x = cpi, y = hdi, color = region)) +
  geom_point(alpha = 0.6) + 
  geom_smooth(method = "lm",
              formula = "y ~ log(x)",
              se = FALSE, color = "grey40") +
  geom_text_repel(data = label, 
                  aes(label = country),
                  box.padding = 0.6,
                  min.segment.length = 0,
                  seed = 9876,
                  color = "black") +
  
  xlab("Corruption Perceptions Index, 2015 \n(100 = least corrupt)") +
  ylab("Human Development Index, 2015 \n(1.0 - most developed)") +
  labs(title = "Corruption and human development (2015)") +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  guides(color = guide_legend(override.aes = list(size = 3))) +
  theme(legend.title = element_blank(),
        legend.position = c(1,0),
        legend.justification = c(1,0))

Admissions at Northwestern

#bar graph data
bar_dat <- admin_data %>%
  mutate(
    #green bar
    cat_a = Applications - Admitted.students,
    #red
    cat_b = Admitted.students - Matriculants,
    #blue
    cat_c = Matriculants)%>%
  select(Year, contains("cat_"))%>%
  pivot_longer(cols = -Year,
               names_to = "category",
               values_to = "values")

bar_label <- admin_data %>%
  select(-contains("rate"))%>%
  pivot_longer(cols = -Year,
               names_to = "category",
               values_to = "values") %>%
  mutate(col_labels = prettyNum(values, big.mark = ","))

bar_plt <- ggplot(bar_dat, aes(Year, values)) +
  geom_col(aes(fill = category)) +
  theme_classic() +
  geom_text(data = bar_label,
            aes(label = col_labels),
            nudge_y = -300,
            size = 2.2,
            color = "white") +
  scale_y_continuous(
    name = "Applicants",
    limits = c(0,50000),
    breaks = seq(0,50000,5000),
    expand = c(0,0),
    labels = scales::label_comma()
    ) +
  scale_x_continuous(
    name = "Entering Year",
    breaks = 1999:2020,
    expand = c(0,0.25),
  ) +
  scale_fill_manual(
    name = NULL,
    values = c("#B6ACD1", "#836EAA", "#4E2A84"),
    labels = c("Applications", "Admitted \nstudents", "Matriculants")
  ) +
  theme(legend.position = c(0.5,1),
        legend.justification = c(0.5, 1),
        legend.direction = "horizontal") +
  labs(title = "Northwestern University \nUndergraduate Admissions 1999-2000")

rate_dat <- admin_data %>%
  select(Year, contains("rate")) %>%
  pivot_longer(cols = -Year,
               names_to = "category",
               values_to = "values")

rate_labels <- rate_dat %>%
  mutate(pct_label = str_c(values, "%"))

#yield plot
rate_plt <- ggplot(rate_dat, aes(Year, values)) +
  geom_line(aes(color = category)) +
  geom_point(aes(color = category, shape = category)) +
  theme_classic() +
  geom_text(data = rate_labels,
            aes(label = pct_label),
            nudge_y = 2.5,
            size = 1.8
            ) +
  scale_x_continuous(
    name = "Entering Year",
    breaks = 1999:2020,
    expand = c(0,0.5),
  ) +
  scale_y_continuous(
    name = "Rate",
    limits = c(0,60),
    breaks = seq(0,60,10),
    expand = c(0,0),
    labels = scales::label_percent(scale = 1)
    ) +
  scale_color_manual(
    name = NULL,
    values = c("#B6ACD1", "#836EAA"),
    labels = c("Admission Rate", "Yield Rate")
  ) +
  scale_shape_manual(
    name = NULL,
    values = c(16, 8),
    labels = c("Admission Rate", "Yield Rate")
  ) +
  theme(legend.position = c(0.5,1),
        legend.justification = c(0.5, 1),
        legend.direction = "horizontal") +
  labs(title = "Northwestern University \nUndergraduate Admissions 1999-2000")

bar_plt / rate_plt