Visualizing the Summary Statistics of Anscombe’s Quartet

#I appreciated how straightforward this assignment was once I figured out how simple it can be to wrap the quartet of charts - it felt like a lightbulb moment in my R journey. It was also fun to explore the new `ggpubr` package that adds the linear regression equation and R² value directly onto the plots.

#load packages and import data
library(tidyverse)
library(ggpubr)
anscombes <- read.csv("./anscombes.csv")

#At first I started by splitting the dataset into separate frames, 
#and plotting each one individually. 
#I then realized that all of this was completely unnecessary for faceting into one output
#and there was a much easier way to do it.

#split into component datasets
setI <- anscombes[-c(1)] %>%
  filter(dataset == "I")

setII <- anscombes[-c(1)] %>%
  filter(dataset == "II")

setIII <- anscombes[-c(1)] %>%
  filter(dataset == "III")

setIV <- anscombes[-c(1)] %>%
  filter(dataset == "IV")

#create individual plots
plotI <- ggplot(setI, aes(x=x, y=y)) +
  geom_point() +
  xlim(c(2,18)) +
  ylim(c(2,12)) +
  theme_light()

plotII <- ggplot(setII, aes(x=x, y=y)) +
  geom_point() +
  xlim(c(2,18)) +
  ylim(c(2,12)) +
  theme_light()

plotIII <- ggplot(setIII, aes(x=x, y=y)) +
  geom_point() +
  xlim(c(2,18)) +
  ylim(c(2,12)) +
  theme_light()

plotIV <- ggplot(setIV, aes(x=x, y=y)) +
  geom_point() +
  xlim(c(2,18)) +
  ylim(c(2,12)) +
  theme_light()

#create faceted plots
#note: I ended up extending both of the axes to make more room for the annotations
facetplots <- ggplot(anscombes, aes(x = x, y = y)) +
  geom_point(shape = 21, size = 4, color = "black", fill = "darkolivegreen") +
  xlim(c(0,25)) +
  ylim(c(0,20)) +
  theme_light(base_size = 16) +
  facet_wrap(~dataset)

#generate dataframe of summary statistics
#note: format() with the argument "nsmall" was my (perhaps inelegant) solution 
#to forcing trailing zero decimals onto the whole numbers
summarystats.df <- 
  anscombes %>%
  group_by(dataset) %>%
  summarise(xmeans = mean(x), ymeans = mean(y), xSD = sd(x), ySD = sd(y)) %>%
  mutate(label_xmeans = paste0("x mean: ", sep = "\n", format(xmeans, nsmall = 4)),
         label_ymeans = paste0("y mean: ", sep = "\n", format(round(ymeans, digits = 4), nsmall = 2)), 
         label_xSD = paste0("x SD: ", sep = "\n", format(round(xSD, digits = 4), nsmall = 2)), 
         label_ySD = paste0("y SD: ", sep = "\n", format(round(ySD, digits = 4), nsmall = 2)))

#add summary stats for each dataset to faceted plots
facetplots.labels <- 
  facetplots +
  geom_vline(data = summarystats.df, aes(xintercept = xmeans), color = "skyblue2", linewidth = 1) +
  geom_label(data = summarystats.df, aes(x = xmeans, label = label_xmeans, y = 18), color = "skyblue2", label.size = 1) +
  
  geom_hline(data = summarystats.df, aes(yintercept = ymeans), color = "deeppink4", linewidth = 1) +
  geom_label(data = summarystats.df, aes(y = ymeans, label = label_ymeans, x = 23), color = "deeppink4", label.size = 1) +
  
  geom_vline(data = summarystats.df, aes(xintercept = (xmeans-xSD)), color = "skyblue2", linewidth = 0.6, linetype = "dashed") +
  geom_vline(data = summarystats.df, aes(xintercept = (xmeans+xSD)), color = "skyblue2", linewidth = 0.6, linetype = "dashed") +
  geom_label(data = summarystats.df, aes(x = (xmeans-xSD), label = label_xSD, y = 14), color = "skyblue2", label.size = 1) +
  #I initially had the SD lines at their numeric positions, 
  #but then realized they should be (SD units) AWAY from the mean line instead
  geom_hline(data = summarystats.df, aes(yintercept = (ymeans-ySD)), color = "deeppink4", linewidth = 0.6, linetype = "dashed") +
  geom_hline(data = summarystats.df, aes(yintercept = (ymeans+ySD)), color = "deeppink4", linewidth = 0.6, linetype = "dashed") +
  geom_label(data = summarystats.df, aes(y = (ymeans-ySD), label = label_ySD, x = 18), color = "deeppink4", label.size = 1)

#add linear regression lines and r^2
facetplots.labels +
  geom_smooth(method="lm", color = "black") +
  stat_regline_equation(label.x = 18, label.y = 18) +
  stat_cor(aes(label = after_stat(rr.label)), label.x = 18, label.y = 17)