# library
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Preview the first ten rows of the mpg data set.
mpg %>%
  head(n = 10)
## # A tibble: 10 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
# Boxplots of highway mileage (hwy) for each vehicle class, with one box per
# drive type (drv) inside every class.
mpg %>%
  ggplot(aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot() +
  theme_minimal() +
  # theme() has to come after theme_minimal(), which is a complete theme and
  # would otherwise override it. hjust = 1 anchors the end of each rotated
  # label at its tick so the labels line up with their classes.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Highway Gas Mileage per Car Class and Wheel Drive",
    x = "Motor Vehicle Class",
    y = "Gas Mileage on Highways (mpg)",
    fill = "Wheel Drive Type"
  )
Questions:
1. Which vehicle class has the highest median highway mpg?
Compact, followed closely by subcompact.
subcompact
Create an alteration of this plot where the outliers are shown in a way in which they do not overlap (consider jittering and coloring).
# Highway mileage per vehicle class and drive type, with the individual
# observations drawn on top of the boxes so overlapping values stay visible.
mpg %>%
  ggplot(aes(x = class, y = hwy, fill = drv)) +
  # Hide the boxplot's own outlier glyphs; the point layer below already
  # draws every observation, so they would otherwise appear twice.
  geom_boxplot(outlier.shape = NA) +
  # Shape 21 is a filled circle: it picks up fill = drv from the plot-level
  # aesthetics and keeps a dark outline, so points remain visible on top of
  # boxes of the same colour.
  # position_jitterdodge() dodges the points by drv the same way the boxes
  # are dodged and then jitters them horizontally only (jitter.height = 0),
  # so the plotted hwy values are the true values. A fixed seed makes the
  # figure reproducible between renders.
  geom_point(
    shape = 21,
    alpha = 0.4,
    position = position_jitterdodge(
      jitter.height = 0,
      dodge.width = 0.75,
      seed = 1
    )
  ) +
  theme_minimal() +
  # hjust = 1 anchors the end of each rotated label at its tick.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Highway Gas Mileage per Car Class and Wheel Drive",
    x = "Motor Vehicle Class",
    y = "Gas Mileage on Highways (mpg)",
    fill = "Wheel Drive Type"
  )
# City mileage (cty) per vehicle class: a pink violin shows the shape of the
# distribution, and a translucent light-blue boxplot is layered on top of it.
ggplot(data = mpg, mapping = aes(x = class, y = cty)) +
  geom_violin(fill = "pink") +
  geom_boxplot(fill = "lightblue", alpha = 0.4) +
  labs(
    title = "City Gas Mileage per Car Type",
    x = "Motor Vehicle Class",
    y = "In City Gas Mileage (mpg)"
  ) +
  theme_minimal()
Questions:
How does the violin plot help you understand the distribution
compared to the boxplot alone?
Violin plots show the density distribution of the data. Unlike boxplots, which don't show where most of the data tends to lean, a violin plot shows where the most and the fewest data points lie.
Are there any classes with unusual distributions or outliers?
Compact and subcompact have large outliers that skew upward. The 2-seater class appears to have a very tight distribution.
# Simple bar chart: number of diamonds in each cut grade.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  theme_minimal() +
  geom_bar()
# Stacked bar chart of cut, split by color. position = "fill" rescales every
# bar to the same height, so the segments show each color's share within a
# cut rather than raw counts.
diamonds %>%
  ggplot(aes(x = cut, fill = color)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  # The default y label would still read "count" although the axis now
  # runs from 0 to 1, so label it as a proportion.
  labs(y = "proportion")
# Grouped bar chart: within each cut, one bar per color placed side by side.
ggplot(data = diamonds, mapping = aes(x = cut, fill = color)) +
  theme_minimal() +
  geom_bar(position = "dodge")
Questions:
What does the stacked barplot show that the simple barplot does
not?
It shows another dimension of the data by adding a second categorical variable (diamond color) to the bars.
When is the grouped barplot more useful than the stacked barplot?
Grouped barplots allow for better comparisons across multiple categories.
Dataset: diamonds
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#head(diamonds, n = 5)
# Mean price for every cut grade; the result has one row per level of `cut`
# with the columns `cut` and `mean_price`.
d1 <- summarise(
  group_by(diamonds, cut),
  mean_price = mean(price)
)
# Column chart of the pre-computed mean price per cut (d1), one coloured
# column per cut grade.
d1 %>%
  ggplot(aes(x = cut, y = mean_price, fill = cut)) +
  geom_col() +
  theme_minimal() +
  # The fill colours repeat what the x axis already says, so the legend is
  # redundant; drop it (this replaces the previous empty theme() call).
  theme(legend.position = "none") +
  labs(
    title = "Average Diamond Price by Cut",
    x = "Cut",
    y = "Average Diamond price",
    caption = "Figure 1. Average diamond price by cut"
  )
Create the same plot using the stat_summary approach. Add error bars to the plot.
# Average price per cut computed inside ggplot with stat_summary() rather
# than from a pre-aggregated table, plus error bars around each mean.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  # Columns: mean price per cut, coloured by cut like the pre-aggregated plot.
  stat_summary(
    aes(fill = cut),
    fun = mean,
    geom = "col"
  ) +
  # Error bars: mean_se() returns the mean plus/minus one standard error.
  stat_summary(
    fun.data = mean_se,
    geom = "errorbar",
    width = 0.3
  ) +
  theme_minimal() +
  # The fill colours duplicate the x axis, so no legend is needed.
  theme(legend.position = "none") +
  labs(
    title = "Average Diamond Price by Cut",
    x = "Cut",
    y = "Average Diamond price",
    caption = "Error bars show the mean +/- 1 standard error"
  )