Task 1: Combining plots

library(tidyverse)
library(patchwork)
library(broom) 
library(marginaleffects)

results_2016 <- read_csv("data/results_2016.csv")

Make 2–3 plots of anything you want from the results_2016 data (histogram, density, boxplot, scatterplot, whatever) and combine them with patchwork. Look at the documentation to see fancy ways of combining them, like having two rows inside a column.

plot_1 <- ggplot(results_2016, aes(x = median_age)) + 
  geom_histogram(binwidth = 1, 
                 color = "white", 
                 fill = "dark green", 
                 boundary = 30) + 
  labs(title = "Median Age Distribution of Voters in the 2016 US Elections", 
       x = "Median Age", 
       y = NULL) 
plot_1

plot_2 <- ggplot(results_2016, aes(x =  median_age, y = percent_dem)) + 
  geom_point(alpha = 0.5, position = position_jitter(), color = "blue") + 
   geom_smooth(color = "green") + 
  labs(x = "Median Age", 
       y = "Democrats Percent")

plot_2

plot_3 <- ggplot(results_2016, aes(x = median_age, y = percent_gop)) + 
  geom_point(alpha = 0.5, position = position_jitter(), color = "red") + 
  geom_smooth(color = "green") + 
  labs(x = "Median Age", 
       y = "Republicans Percent")

plot_3

patchwork <- plot_1 / (plot_2 | plot_3)
patchwork & theme_bw()

Task 2: Visualizing regression

Coefficient plot

Use the results_2016 data to create a model that predicts the percent of Democratic votes in a precinct based on age, race, income, rent, and state (hint: the formula will look like this: percent_dem ~ median_age + percent_white + per_capita_income + median_rent + state)

Use tidy() in the broom package and geom_pointrange() to create a coefficient plot for the model estimates. You’ll have 50 rows for all the states, and that’s excessive for a plot like this, so you’ll want to filter out the state rows. You can do that by adding this:

reg_model <- lm(percent_dem ~ median_age + percent_white + per_capita_income + 
                median_rent + state, 
                data = results_2016) 

model_tidied <- tidy(reg_model, conf.int = TRUE) %>%
  filter(!str_detect(term, "state")) %>% 
  filter(term != "(Intercept)")

ggplot(model_tidied,
       aes(x = estimate, y = term)) +
  geom_vline(xintercept = 0, color = "red", linetype = "dotted") +
  geom_pointrange(aes(xmin = conf.low, xmax = conf.high)) + 
  labs(x = "Coefficient estimate", y = NULL) +
  theme_minimal()

The str_detect() function looks for the characters “state” in the term column. The ! negates it. This is thus saying “only keep rows where the word ‘state’ is not in the term name”.

You should also get rid of the intercept (filter(term != "(Intercept)")).

Predicted values

Show what happens to percent_dem as one (or more) of your model’s variables changes. To make life easy, refer to the “Predicted values and marginal effects in 2022” section in this session’s example and use predictions() rather than creating your own newdata data set by hand. You’ll do something like this (assuming you’re manipulating per_capita_income; try using a different variable when you do the assignment, though):

my_predictions <- predictions(
  reg_model,
  newdata = datagrid(median_age = seq(20, 70, by = 10),
                     state = "Georgia")) 

my_predictions %>%
  select(median_age, predicted, std.error, conf.low, conf.high)

##   median_age predicted std.error conf.low conf.high
## 1         20  22.31998 0.9885077 20.38254  24.25742
## 2         30  23.98251 0.7390865 22.53392  25.43109
## 3         40  25.64503 0.6209508 24.42799  26.86208
## 4         50  27.30756 0.7036623 25.92841  28.68672
## 5         60  28.97009 0.9353775 27.13679  30.80340
## 6         70  30.63262 1.2348892 28.21228  33.05296

my_predictions_plot_1 <- ggplot(my_predictions, aes(x = median_age, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              fill = "#0015BC", alpha = 0.5) + 
  geom_line(size = 1, color = "dark blue") +
  labs(x = "Median Age", y = "Predicted Democrats Percent") +
  theme_minimal()

ggsave(my_predictions_plot_1, filename = "pred_plot_1.png", width = 8, height = 4.5)

Plot your varied variable on the x-axis, the fitted values (predicted) on the y-axis, show the relationship with a line, and add a ribbon to show the 95% confidence interval.

Bonus task 1! Correlograms

This is entirely optional but might be fun.

For extra fun times, if you feel like it, create a correlogram heatmap, either with geom_tile() or with points sized by the correlation. Use any variables you want from results_2016.

correlate <- results_2016 %>% 
  select(median_age, percent_white, per_capita_income, median_rent) %>% 
  cor()

correlate

##                   median_age percent_white per_capita_income median_rent
## median_age                 1            NA                NA          NA
## percent_white             NA             1                NA          NA
## per_capita_income         NA            NA                 1          NA
## median_rent               NA            NA                NA           1

correlate[lower.tri(correlate)] <- NA
correlate

##                   median_age percent_white per_capita_income median_rent
## median_age                 1            NA                NA          NA
## percent_white             NA             1                NA          NA
## per_capita_income         NA            NA                 1          NA
## median_rent               NA            NA                NA           1

correlate_long <- correlate %>% 
  as.data.frame() %>% 
  rownames_to_column("measure2") %>% 
  pivot_longer(cols = -measure2,
               names_to = "measure1",
               values_to = "cor") %>% 
  mutate(nice_cor = round(cor, 2)) %>% 
  mutate(measure1 = fct_inorder(measure1),
         measure2 = fct_inorder(measure2))

correlate_long

## # A tibble: 16 × 4
##    measure2          measure1            cor nice_cor
##    <fct>             <fct>             <dbl>    <dbl>
##  1 median_age        median_age            1        1
##  2 median_age        percent_white        NA       NA
##  3 median_age        per_capita_income    NA       NA
##  4 median_age        median_rent          NA       NA
##  5 percent_white     median_age           NA       NA
##  6 percent_white     percent_white         1        1
##  7 percent_white     per_capita_income    NA       NA
##  8 percent_white     median_rent          NA       NA
##  9 per_capita_income median_age           NA       NA
## 10 per_capita_income percent_white        NA       NA
## 11 per_capita_income per_capita_income     1        1
## 12 per_capita_income median_rent          NA       NA
## 13 median_rent       median_age           NA       NA
## 14 median_rent       percent_white        NA       NA
## 15 median_rent       per_capita_income    NA       NA
## 16 median_rent       median_rent           1        1

ggplot(correlate_long, 
       aes(x = measure2, y = measure1, color = cor)) +
  geom_point(aes(size = abs(cor))) +
  scale_color_gradient2(low = "#E16462", mid = "white", high = "#0D0887",
                        limits = c(-1, 1)) +
  scale_size_area(max_size = 10, limits = c(-1, 1), guide = "none") +
  labs(x = NULL, y = NULL) +
  coord_equal() +
  theme_minimal() +
  theme(panel.grid = element_blank())

Bonus task 2! Marginal effects

This is also entirely optional but will be super useful if you use regression for anything in your own work.

For extra super bonus fun times, create a more complex model that predicts percent_dem that uses polynomial terms (e.g. age squared) and/or interaction terms (e.g. age × state). Plot predictions from the model, use marginaleffects() to find the slopes of those predictions at different values, and plot the slopes in a marginal effects plot. (The “Predicted values and marginal effects in 2022” section from the example will be indispensable here.)

reg_model_2 <- lm(percent_dem ~ median_age + I(median_age^2) + percent_white + per_capita_income 
                  + median_rent + per_capita_income*median_rent + state, 
                data = results_2016) 

model_tidied <- tidy(reg_model_2, conf.int = TRUE) %>%
  filter(!str_detect(term, "state")) %>% 
  filter(term != "(Intercept)")

ggplot(model_tidied,
       aes(x = estimate, y = term)) +
  geom_vline(xintercept = 0, color = "red", linetype = "dotted") +
  geom_pointrange(aes(xmin = conf.low, xmax = conf.high)) + 
  labs(x = "Coefficient estimate", y = NULL) +
  theme_minimal()

my_predictions_2 <- predictions(
  reg_model_2,
  newdata = datagrid(median_age = seq(20, 70, by = 10),
                     state = "Georgia")) 

my_predictions_2 %>%
  select(median_age, predicted, std.error, conf.low, conf.high)

##   median_age predicted std.error conf.low conf.high
## 1         20  32.54470 1.6908958 29.23061  35.85880
## 2         30  26.32525 0.8009805 24.75535  27.89514
## 3         40  25.10351 0.6240346 23.88042  26.32659
## 4         50  28.87948 0.7410882 27.42698  30.33199
## 5         60  37.65318 1.5111420 34.69139  40.61496
## 6         70  51.42458 3.0775161 45.39276  57.45641

my_predictions_plot_2 <- ggplot(my_predictions_2, aes(x = median_age, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              fill = "#0015BC", alpha = 0.5) + 
  geom_line(size = 1, color = "dark blue") +
  labs(x = "Median Age", y = "Predicted Democrats Percent") +
  theme_minimal() 

ggsave(my_predictions_plot_2, filename = "pred_plot_2.png", width = 8, height = 4.5)

2016 US Elections