STATISTICS & ANALYSIS II

# Load libraries
library(tidyverse)
library(ggplot2)
library(gt)
library(glue)
library(ggtext)

# Read the Anscombe data from CSV; the later steps rely on its
# dataset, x and y columns
anscombe_tidy <- read_csv(file = "anscombes.csv")

Step 1: Create dataframe with summary statistics

Before I plot my anscombe_tidy dataframe, I need to prepare the summary statistics that I’ll use as labels. I can calculate them with the functions group_by() and summarise() and then store the results in a new dataframe, anscombe_summary.

# Create dataframe with summary statistics
#
# One row per dataset holding the mean and standard deviation of x and y plus
# the squared Pearson correlation of x and y. Every numeric column is rounded
# to 2 decimals because the values are only used as plot labels.

anscombe_summary <- anscombe_tidy %>%
  group_by(dataset) %>%
  summarise(
    mean_x = mean(x),
    mean_y = mean(y),
    sd_x = sd(x),
    sd_y = sd(y),
    # cor() returns r; square it so the column really holds r^2
    r_square = cor(x, y)^2
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2)))

anscombe_summary
## # A tibble: 4 x 6
##   dataset mean_x mean_y  sd_x  sd_y r_square
##   <chr>    <dbl>  <dbl> <dbl> <dbl>    <dbl>
## 1 I            9    7.5  3.32  2.03     0.67
## 2 II           9    7.5  3.32  2.03     0.67
## 3 III          9    7.5  3.32  2.03     0.67
## 4 IV           9    7.5  3.32  2.03     0.67


Step 2: Plot Anscombe’s quartet with summary statistics

Now that I’ve calculated the summary statistics I’ll need, I can go ahead and plot Anscombe’s quartet using the function ggplot(). I can show the quartet as a series of small multiples with the function facet_wrap() and label each plot with the function geom_richtext().

# Plot Anscombe's quartet and label with summary statistics
#
# One facet per dataset, each showing:
#   * the raw observations, coloured by dataset
#   * a black circle at (mean_x, mean_y)
#   * black squares at the mean plus and minus one standard deviation
#   * a linear-model fit with its confidence band
#   * rich-text labels for r^2, the mean and the standard deviation
#
# Every summary value is read from a column of anscombe_summary, so each
# layer stays aligned with its facet regardless of row order.

anscombe_tidy %>%
  ggplot(aes(x, y)) +
  geom_point(aes(color = dataset)) +
  geom_point(
    data = anscombe_summary,
    aes(mean_x, mean_y),
    color = "black", size = 4, shape = 19
  ) +
  geom_point(
    data = anscombe_summary,
    aes(mean_x + sd_x, mean_y + sd_y),
    color = "black", size = 3, shape = 15
  ) +
  geom_point(
    data = anscombe_summary,
    aes(mean_x - sd_x, mean_y - sd_y),
    color = "black", size = 3, shape = 15
  ) +
  # Explicit formula avoids the "using formula 'y ~ x'" message
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE, color = "blue") +
  geom_richtext(
    data = anscombe_summary,
    aes(
      x = mean_x + 7,
      # y is anchored on the y mean; the offset lifts the label above the data
      y = mean_y + 6.5,
      color = dataset,
      label = glue("r<sup>2</sup> = {r_square}"),
      # Label background: the text colour at 20% opacity
      fill = after_scale(alpha(color, .2))
    )
  ) +
  geom_richtext(
    data = anscombe_summary,
    aes(
      x = mean_x,
      y = mean_y + 5.5,
      color = dataset,
      label = glue("mean = <br> ({mean_x}, {mean_y})"),
      fill = after_scale(alpha(color, .2))
    )
  ) +
  geom_richtext(
    data = anscombe_summary,
    aes(
      x = sd_x + 10,
      y = sd_y + 3,
      color = dataset,
      label = glue(" sd = <br> +-({sd_x}, {sd_y})"),
      fill = after_scale(alpha(color, .2))
    )
  ) +
  # Facet strips already name each dataset, so the colour legend is redundant
  theme(legend.position = "none") +
  facet_wrap(~ dataset, ncol = 2) +
  labs(
    title = "Anscombe's Quartet",
    x = "X",
    y = "Y",
    subtitle = "Demonstrates why visualization is necessary",
    caption = "4 Data Sets"
  )