library(tidyverse)
## ── Attaching packages ────────────────
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggbeeswarm)
# Draw a random sample of 1000 diamonds, keep only the "Ideal" and
# "Very Good" cuts, and add the price per carat (ppc) of each stone.
# The sample is random, so `divg` differs from run to run.
divg <- diamonds %>%
  sample_n(1000) %>%
  filter(cut %in% c("Ideal", "Very Good")) %>%
  mutate(ppc = price / carat)
# Density curve of price per carat, one panel per cut, laid out in
# two columns.
ggplot(divg, aes(x = ppc)) +
  geom_density() +
  facet_wrap(~ cut, ncol = 2)
# Beeswarm plot (from ggbeeswarm) of price per carat, with cut on the
# x axis.
ggplot(divg, aes(x = cut, y = ppc)) +
  geom_beeswarm()
# Same data as a violin plot: price per carat on the y axis, cut on the
# x axis.
ggplot(divg, aes(x = cut, y = ppc)) +
  geom_violin()
Which cell (that is, which combination of cut, color, and clarity) in the diamonds dataset has the highest mean price per carat (ppc)?
# Label every diamond with its cell, "<cut>_<color>_<clarity>", then
# compute the mean price per carat (mppc) and the number of diamonds
# (count) in each cell. Cells are sorted from highest to lowest mppc.
dsort <- diamonds %>%
  mutate(
    ppc = price / carat,
    vgroup = paste(cut, color, clarity, sep = "_")
  ) %>%
  group_by(vgroup) %>%
  summarise(
    mppc = mean(ppc),
    count = n()
  ) %>%
  arrange(desc(mppc))

# Show the six cells with the highest mean price per carat.
head(dsort, n = 6)
Let’s look at the distribution of ppc within the high-value cells, i.e. those whose mean ppc is above 9000.
# Rebuild the per-diamond cell label ("<cut>_<color>_<clarity>") and
# keep only the diamonds in four hard-coded cells: color D, clarity IF,
# for the cuts Very Good, Good, Premium and Ideal.
hival <- diamonds %>%
  mutate(
    ppc = price / carat,
    vgroup = paste(cut, color, clarity, sep = "_")
  ) %>%
  filter(
    vgroup %in% c(
      "Very Good_D_IF",
      "Good_D_IF",
      "Premium_D_IF",
      "Ideal_D_IF"
    )
  )

# Violin plot of price per carat, one violin per selected cell.
ggplot(hival, aes(x = vgroup, y = ppc)) +
  geom_violin()