Beeswarm and Violin

library(tidyverse)

## ── Attaching packages ────────────────

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ─────────────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggbeeswarm)

# Fix the RNG seed so the random sample (and every plot built from it)
# is identical from run to run.
set.seed(2018)

# Sample 1000 diamonds, keep only the "Ideal" and "Very Good" cuts, and
# add ppc = price / carat (price per carat).
divg <- diamonds %>%
  sample_n(1000) %>%
  filter(cut %in% c("Ideal", "Very Good")) %>%
  mutate(ppc = price / carat)
# Density of price per carat, drawn in a separate panel for each cut.
ggplot(divg, aes(x = ppc)) +
  geom_density() +
  facet_wrap(~ cut, ncol = 2)

# Beeswarm plot: one point per sampled diamond, ppc by cut.
ggplot(divg, aes(x = cut, y = ppc)) +
  geom_beeswarm()

# Violin plot of ppc by cut for the same sample.
ggplot(divg, aes(x = cut, y = ppc)) +
  geom_violin()

Which cell (combination of cut, color, and clarity) in the diamonds dataset has the highest mean price per carat (ppc)?

# For every cut/color/clarity combination (labelled "cut_color_clarity" in
# vgroup), compute the mean price per carat and the number of diamonds,
# then order the cells from highest to lowest mean.
dsort <- diamonds %>%
  mutate(
    ppc = price / carat,
    vgroup = paste(cut, color, clarity, sep = "_")
  ) %>%
  group_by(vgroup) %>%
  summarize(
    mppc = mean(ppc),
    count = n()
  ) %>%
  arrange(desc(mppc))

# Show the top of the ranking.
head(dsort)

Exercise

Let’s look at the distribution of ppc within the high-value cells, i.e. those whose mean ppc is above 9000.

# Keep only the diamonds that belong to the four listed
# cut/color/clarity cells, with ppc and the vgroup label attached.
hival <- diamonds %>%
  mutate(
    ppc = price / carat,
    vgroup = paste(cut, color, clarity, sep = "_")
  ) %>%
  filter(
    vgroup %in% c("Very Good_D_IF", "Good_D_IF", "Premium_D_IF", "Ideal_D_IF")
  )

# Violin plot of ppc for each of the selected cells.
ggplot(hival, aes(x = vgroup, y = ppc)) +
  geom_violin()